invoke.texi (-fprefetch-loop-arrays, [...]): Document.

* doc/invoke.texi (-fprefetch-loop-arrays, -fprefetch-loop-arrays-rtl): Document. * tree-ssa-loop-niter.c (number_of_iterations_ne, number_of_iterations_lt, number_of_iterations_cond): Remember the shape of the ending condition. * tree-ssa-loop-manip.c: Include params.h. (build_if_stmt, can_unroll_loop_p, determine_exit_conditions, tree_unroll_loop): New functions. * tree-pass.h (pass_loop_prefetch): Declare. * loop.c (rest_of_handle_loop_optimize): Test for -fprefetch-loop-arrays-rtl. * tree-scalar-evolution.h (affine_iv): Moved to tree-flow.h. * timevar.def (TV_TREE_PREFETCH): New timevar. * tree-ssa-loop.c (tree_ssa_loop_prefetch, gate_tree_ssa_loop_prefetch, pass_loop_prefetch): New. * tree-cfgcleanup.c: Include tree-scalar-evolution.h. (cleanup_tree_cfg_loop): Call scev_reset. * common.opt (fprefetch-loop-arrays-rtl): Add. * tree-ssa-loop-prefetch.c: New file. * tree-outof-ssa.c (struct value_expr_d): Add expr_vars field. (new_temp_expr_table): Initialize expr_vars. (free_temp_expr_table): Cleanup expr_vars. (check_replaceable, find_replaceable_in_bb): Prevent accumulating expressions from being merged into one. * tree-flow.h (affine_iv): Moved from tree-scalar-evolution.h. (struct tree_niter_desc): Add control, bound and cmp fields. (tree_ssa_prefetch_arrays, can_unroll_loop_p, tree_unroll_loop): Declare. * Makefile.in (tree-ssa-loop-prefetch.o): Add. (tree-cfgcleanup.o): Add SCEV_H dependency. (tree-ssa-loop-manip.o): Add PARAMS_H dependency. * passes.c (init_optimization_passes): Add pass_loop_prefetch. From-SVN: r110964
2006-02-14 13:22:11 +01:00 · 2006-02-14 13:22:11 +01:00 · 1768461812
parent 0a4288d92e
commit 1768461812
15 changed files with 512 additions and 41 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,38 @@
+2006-02-14  Zdenek Dvorak <dvorakz@suse.cz>
+
+	* doc/invoke.texi (-fprefetch-loop-arrays, -fprefetch-loop-arrays-rtl):
+	Document.
+	* tree-ssa-loop-niter.c (number_of_iterations_ne,
+	number_of_iterations_lt, number_of_iterations_cond): Remember the shape
+	of the ending condition.
+	* tree-ssa-loop-manip.c: Include params.h.
+	(build_if_stmt, can_unroll_loop_p, determine_exit_conditions,
+	tree_unroll_loop): New functions.
+	* tree-pass.h (pass_loop_prefetch): Declare.
+	* loop.c (rest_of_handle_loop_optimize): Test for
+	-fprefetch-loop-arrays-rtl.
+	* tree-scalar-evolution.h (affine_iv): Moved to tree-flow.h.
+	* timevar.def (TV_TREE_PREFETCH): New timevar.
+	* tree-ssa-loop.c (tree_ssa_loop_prefetch, gate_tree_ssa_loop_prefetch,
+	pass_loop_prefetch): New.
+	* tree-cfgcleanup.c: Include tree-scalar-evolution.h.
+	(cleanup_tree_cfg_loop): Call scev_reset.
+	* common.opt (fprefetch-loop-arrays-rtl): Add.
+	* tree-ssa-loop-prefetch.c: New file.
+	* tree-outof-ssa.c (struct value_expr_d): Add expr_vars field.
+	(new_temp_expr_table): Initialize expr_vars.
+	(free_temp_expr_table): Cleanup expr_vars.
+	(check_replaceable, find_replaceable_in_bb): Prevent accumulating
+	expressions from being merged into one.
+	* tree-flow.h (affine_iv): Moved from tree-scalar-evolution.h.
+	(struct tree_niter_desc): Add control, bound and cmp fields.
+	(tree_ssa_prefetch_arrays, can_unroll_loop_p, tree_unroll_loop):
+	Declare.
+	* Makefile.in (tree-ssa-loop-prefetch.o): Add.
+	(tree-cfgcleanup.o): Add SCEV_H dependency.
+	(tree-ssa-loop-manip.o): Add PARAMS_H dependency.
+	* passes.c (init_optimization_passes): Add pass_loop_prefetch.
+
 2006-02-14  Richard Guenther  <rguenther@suse.de>

 	PR tree-optimization/26258
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@ -963,7 +963,7 @@ OBJS-common = \
 tree-vect-generic.o tree-ssa-loop.o tree-ssa-loop-niter.o		   \
 tree-ssa-loop-manip.o tree-ssa-threadupdate.o tree-ssa-threadedge.o	   \
 tree-vectorizer.o tree-vect-analyze.o tree-vect-transform.o		   \
- tree-vect-patterns.o                                                      \
+ tree-vect-patterns.o tree-ssa-loop-prefetch.o				   \
 tree-ssa-loop-ivcanon.o tree-ssa-propagate.o tree-ssa-address.o	   \
 tree-ssa-math-opts.o							   \
 tree-ssa-loop-ivopts.o tree-if-conv.o tree-ssa-loop-unswitch.o		   \
@ -1975,6 +1975,12 @@ tree-ssa-loop-ch.o : tree-ssa-loop-ch.c $(TREE_FLOW_H) $(CONFIG_H) \
   $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(TREE_INLINE_H) \
   output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
   tree-pass.h $(FLAGS_H) $(BASIC_BLOCK_H) hard-reg-set.h
+tree-ssa-loop-prefetch.o: tree-ssa-loop-prefetch.c $(TREE_FLOW_H) $(CONFIG_H) \
+   $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(EXPR_H) \
+   output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
+   tree-pass.h $(GGC_H) $(RECOG_H) insn-config.h $(HASHTAB_H) $(SCEV_H) \
+   $(CFGLOOP_H) $(PARAMS_H) langhooks.h $(BASIC_BLOCK_H) hard-reg-set.h \
+   tree-chrec.h toplev.h langhooks.h
 tree-ssa-loop-ivopts.o : tree-ssa-loop-ivopts.c $(TREE_FLOW_H) $(CONFIG_H) \
   $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(EXPR_H) \
   output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
@ -1984,7 +1990,8 @@ tree-ssa-loop-ivopts.o : tree-ssa-loop-ivopts.c $(TREE_FLOW_H) $(CONFIG_H) \
 tree-ssa-loop-manip.o : tree-ssa-loop-manip.c $(TREE_FLOW_H) $(CONFIG_H) \
   $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) \
   output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
-   tree-pass.h $(CFGLAYOUT_H) $(SCEV_H) $(BASIC_BLOCK_H) hard-reg-set.h
+   tree-pass.h $(CFGLAYOUT_H) $(SCEV_H) $(BASIC_BLOCK_H) hard-reg-set.h \
+   $(PARAMS_H)
 tree-ssa-loop-im.o : tree-ssa-loop-im.c $(TREE_FLOW_H) $(CONFIG_H) \
   $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) domwalk.h \
   $(PARAMS_H) output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h \
--- a/gcc/common.opt
+++ b/gcc/common.opt
@ -659,7 +659,11 @@ Common Report Var(flag_pie,1) VarExists
 Generate position-independent code for executables if possible (small mode)

 fprefetch-loop-arrays
-Common Report Var(flag_prefetch_loop_arrays)
+Common Report Var(flag_prefetch_loop_arrays,1)
+Generate prefetch instructions, if available, for arrays in loops
+
+fprefetch-loop-arrays-rtl
+Common Report Var(flag_prefetch_loop_arrays,2)
 Generate prefetch instructions, if available, for arrays in loops

 fprofile
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@ -321,7 +321,7 @@ Objective-C and Objective-C++ Dialects}.
 -funsafe-math-optimizations  -funsafe-loop-optimizations  -ffinite-math-only @gol
 -fno-toplevel-reorder -fno-trapping-math  -fno-zero-initialized-in-bss @gol
 -fomit-frame-pointer  -foptimize-register-move @gol
-foptimize-sibling-calls  -fprefetch-loop-arrays @gol
+-foptimize-sibling-calls  -fprefetch-loop-arrays -fprefetch-loop-arrays-rtl @gol
 -fprofile-generate -fprofile-use @gol
 -fregmove  -frename-registers @gol
 -freorder-blocks  -freorder-blocks-and-partition -freorder-functions @gol
@ -5171,7 +5171,9 @@ With this option, the compiler will create multiple copies of some
 local variables when unrolling a loop which can result in superior code.

@item -fprefetch-loop-arrays
+@itemx -fprefetch-loop-arrays-rtl
@opindex fprefetch-loop-arrays
+@opindex fprefetch-loop-arrays-rtl
 If supported by the target machine, generate instructions to prefetch
 memory to improve the performance of loops that access large arrays.

@ -5709,7 +5711,9 @@ Move branches with loop invariant conditions out of the loop, with duplicates
 of the loop on both branches (modified according to result of the condition).

@item -fprefetch-loop-arrays
+@itemx -fprefetch-loop-arrays-rtl
@opindex fprefetch-loop-arrays
+@opindex fprefetch-loop-arrays-rtl
 If supported by the target machine, generate instructions to prefetch
 memory to improve the performance of loops that access large arrays.

--- a/gcc/loop.c
+++ b/gcc/loop.c
@ -11780,7 +11780,7 @@ rest_of_handle_loop_optimize (void)
  free_bb_for_insn ();
  profile_status = PROFILE_ABSENT;
  
-  do_prefetch = flag_prefetch_loop_arrays ? LOOP_PREFETCH : 0;
+  do_prefetch = flag_prefetch_loop_arrays == 2 ? LOOP_PREFETCH : 0;
  
  if (flag_rerun_loop_opt)
    {
--- a/gcc/passes.c
+++ b/gcc/passes.c
@ -601,6 +601,7 @@ init_optimization_passes (void)
     vectorizer creates alias relations that are not supported by
     pass_may_alias.  */
  NEXT_PASS (pass_complete_unroll);
+  NEXT_PASS (pass_loop_prefetch);
  NEXT_PASS (pass_iv_optimize);
  NEXT_PASS (pass_tree_loop_done);
  *p = NULL;
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@ -107,6 +107,7 @@ DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH    , "tree loop unswitching")
 DEFTIMEVAR (TV_COMPLETE_UNROLL       , "complete unrolling")
 DEFTIMEVAR (TV_TREE_VECTORIZATION    , "tree vectorization")
 DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear")
+DEFTIMEVAR (TV_TREE_PREFETCH	     , "tree prefetching")
 DEFTIMEVAR (TV_TREE_LOOP_IVOPTS	     , "tree iv optimization")
 DEFTIMEVAR (TV_TREE_LOOP_INIT	     , "tree loop init")
 DEFTIMEVAR (TV_TREE_LOOP_FINI	     , "tree loop fini")
--- a/gcc/tree-cfgcleanup.c
+++ b/gcc/tree-cfgcleanup.c
@ -45,6 +45,7 @@ Boston, MA 02110-1301, USA.  */
 #include "cfglayout.h"
 #include "hashtab.h"
 #include "tree-ssa-propagate.h"
+#include "tree-scalar-evolution.h"

 /* Remove any fallthru edge from EV.  Return true if an edge was removed.  */

@ -559,23 +560,26 @@ cleanup_tree_cfg (void)
 void
 cleanup_tree_cfg_loop (void)
 {
-  bitmap changed_bbs = BITMAP_ALLOC (NULL);
+  bool changed = cleanup_tree_cfg ();

-  cleanup_tree_cfg ();
+  if (changed)
+    {
+      bitmap changed_bbs = BITMAP_ALLOC (NULL);
+      fix_loop_structure (current_loops, changed_bbs);
+      calculate_dominance_info (CDI_DOMINATORS);

-  fix_loop_structure (current_loops, changed_bbs);
-  calculate_dominance_info (CDI_DOMINATORS);
+      /* This usually does nothing.  But sometimes parts of cfg that originally
+	 were inside a loop get out of it due to edge removal (since they
+	 become unreachable by back edges from latch).  */
+      rewrite_into_loop_closed_ssa (changed_bbs, TODO_update_ssa);

-  /* This usually does nothing.  But sometimes parts of cfg that originally
-     were inside a loop get out of it due to edge removal (since they
-     become unreachable by back edges from latch).  */
-  rewrite_into_loop_closed_ssa (changed_bbs, TODO_update_ssa);
-
-  BITMAP_FREE (changed_bbs);
+      BITMAP_FREE (changed_bbs);

 #ifdef ENABLE_CHECKING
-  verify_loop_structure (current_loops);
+      verify_loop_structure (current_loops);
 #endif
+      scev_reset ();
+    }
 }

 /* Merge the PHI nodes at BB into those at BB's sole successor.  */
--- a/gcc/tree-flow.h
+++ b/gcc/tree-flow.h
@ -667,6 +667,17 @@ extern void replace_exp (use_operand_p, tree);
 extern bool may_propagate_copy (tree, tree);
 extern bool may_propagate_copy_into_asm (tree);

+/* Affine iv.  */
+
+typedef struct
+{
+  /* Iv = BASE + STEP * i.  */
+  tree base, step;
+
+  /* True if this iv does not overflow.  */
+  bool no_overflow;
+} affine_iv;
+
 /* Description of number of iterations of a loop.  All the expressions inside
   the structure can be evaluated at the end of the loop's preheader
   (and due to ssa form, also anywhere inside the body of the loop).  */
@ -697,6 +708,15 @@ struct tree_niter_desc
 			   MAX_SIGNED_INT.  However if the (n <= 0) assumption
 			   is eliminated (by looking at the guard on entry of
 			   the loop), then the information would be lost.  */
+
+  /* The simplified shape of the exit condition.  The loop exits if
+     CONTROL CMP BOUND is false, where CMP is one of NE_EXPR,
+     LT_EXPR, or GT_EXPR, and step of CONTROL is positive if CMP is
+     LE_EXPR and negative if CMP is GE_EXPR.  This information is used
+     by loop unrolling.  */
+  affine_iv control;
+  tree bound;
+  enum tree_code cmp;
 };

 /* In tree-vectorizer.c */
@ -711,6 +731,7 @@ void tree_ssa_lim (struct loops *);
 void tree_ssa_unswitch_loops (struct loops *);
 void canonicalize_induction_variables (struct loops *);
 void tree_unroll_loops_completely (struct loops *, bool);
+void tree_ssa_prefetch_arrays (struct loops *);
 void remove_empty_loops (struct loops *);
 void tree_ssa_iv_optimize (struct loops *);

@ -748,6 +769,10 @@ struct loop *tree_ssa_loop_version (struct loops *, struct loop *, tree,
 tree expand_simple_operations (tree);
 void substitute_in_loop_info (struct loop *, tree, tree);
 edge single_dom_exit (struct loop *);
+bool can_unroll_loop_p (struct loop *loop, unsigned factor,
+			struct tree_niter_desc *niter);
+void tree_unroll_loop (struct loops *, struct loop *, unsigned,
+		       edge, struct tree_niter_desc *);

 /* In tree-ssa-threadedge.c */
 extern bool potentially_threadable_block (basic_block);
--- a/gcc/tree-outof-ssa.c
+++ b/gcc/tree-outof-ssa.c
@ -1299,7 +1299,8 @@ typedef struct value_expr_d
 typedef struct temp_expr_table_d 
 {
  var_map map;
-  void **version_info;		
+  void **version_info;
+  bitmap *expr_vars;
  value_expr_p *partition_dep_list;
  bitmap replaceable;
  bool saw_replaceable;
@ -1344,6 +1345,7 @@ new_temp_expr_table (var_map map)
  t->map = map;

  t->version_info = XCNEWVEC (void *, num_ssa_names + 1);
+  t->expr_vars = XCNEWVEC (bitmap, num_ssa_names + 1);
  t->partition_dep_list = XCNEWVEC (value_expr_p,
                                    num_var_partitions (map) + 1);

@ -1367,6 +1369,7 @@ free_temp_expr_table (temp_expr_table_p t)
 {
  value_expr_p p;
  tree *ret = NULL;
+  unsigned i;

 #ifdef ENABLE_CHECKING
  unsigned x;
@ -1383,6 +1386,11 @@ free_temp_expr_table (temp_expr_table_p t)
  BITMAP_FREE (t->partition_in_use);
  BITMAP_FREE (t->replaceable);

+  for (i = 0; i <= num_ssa_names; i++)
+    if (t->expr_vars[i])
+      BITMAP_FREE (t->expr_vars[i]);
+  free (t->expr_vars);
+
  free (t->partition_dep_list);
  if (t->saw_replaceable)
    ret = (tree *)t->version_info;
@ -1545,11 +1553,12 @@ add_dependance (temp_expr_table_p tab, int version, tree var)
 static bool
 check_replaceable (temp_expr_table_p tab, tree stmt)
 {
-  tree var, def;
+  tree var, def, basevar;
  int version;
  var_map map = tab->map;
  ssa_op_iter iter;
  tree call_expr;
+  bitmap def_vars = BITMAP_ALLOC (NULL), use_vars;

  if (TREE_CODE (stmt) != MODIFY_EXPR)
    return false;
@ -1580,12 +1589,19 @@ check_replaceable (temp_expr_table_p tab, tree stmt)
    }

  version = SSA_NAME_VERSION (def);
+  basevar = SSA_NAME_VAR (def);
+  bitmap_set_bit (def_vars, DECL_UID (basevar));

  /* Add this expression to the dependency list for each use partition.  */
  FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE)
    {
      add_dependance (tab, version, var);
+
+      use_vars = tab->expr_vars[SSA_NAME_VERSION (var)];
+      if (use_vars)
+	bitmap_ior_into (def_vars, use_vars);
    }
+  tab->expr_vars[version] = def_vars;

  /* If there are VUSES, add a dependence on virtual defs.  */
  if (!ZERO_SSA_OPERANDS (stmt, SSA_OP_VUSE))
@ -1704,7 +1720,7 @@ static void
 find_replaceable_in_bb (temp_expr_table_p tab, basic_block bb)
 {
  block_stmt_iterator bsi;
-  tree stmt, def;
+  tree stmt, def, use;
  stmt_ann_t ann;
  int partition;
  var_map map = tab->map;
@ -1717,30 +1733,34 @@ find_replaceable_in_bb (temp_expr_table_p tab, basic_block bb)
      ann = stmt_ann (stmt);

      /* Determine if this stmt finishes an existing expression.  */
-      FOR_EACH_SSA_TREE_OPERAND (def, stmt, iter, SSA_OP_USE)
+      FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
 	{
-	  if (tab->version_info[SSA_NAME_VERSION (def)])
+	  unsigned ver = SSA_NAME_VERSION (use);
+
+	  if (tab->version_info[ver])
 	    {
 	      bool same_root_var = false;
-	      tree def2;
 	      ssa_op_iter iter2;
+	      bitmap vars = tab->expr_vars[ver];

 	      /* See if the root variables are the same.  If they are, we
 		 do not want to do the replacement to avoid problems with
 		 code size, see PR tree-optimization/17549.  */
-	      FOR_EACH_SSA_TREE_OPERAND (def2, stmt, iter2, SSA_OP_DEF)
-		if (SSA_NAME_VAR (def) == SSA_NAME_VAR (def2))
-		  {
-		    same_root_var = true;
-		    break;
-		  }
+	      FOR_EACH_SSA_TREE_OPERAND (def, stmt, iter2, SSA_OP_DEF)
+		{
+		  if (bitmap_bit_p (vars, DECL_UID (SSA_NAME_VAR (def))))
+		    {
+		      same_root_var = true;
+		      break;
+		    }
+		}

 	      /* Mark expression as replaceable unless stmt is volatile
 		 or DEF sets the same root variable as STMT.  */
 	      if (!ann->has_volatile_ops && !same_root_var)
-		mark_replaceable (tab, def);
+		mark_replaceable (tab, use);
 	      else
-		finish_expr (tab, SSA_NAME_VERSION (def), false);
+		finish_expr (tab, ver, false);
 	    }
 	}
      
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@ -247,6 +247,7 @@ extern struct tree_opt_pass pass_record_bounds;
 extern struct tree_opt_pass pass_if_conversion;
 extern struct tree_opt_pass pass_vectorize;
 extern struct tree_opt_pass pass_complete_unroll;
+extern struct tree_opt_pass pass_loop_prefetch;
 extern struct tree_opt_pass pass_iv_optimize;
 extern struct tree_opt_pass pass_tree_loop_done;
 extern struct tree_opt_pass pass_ch;
--- a/gcc/tree-scalar-evolution.h
+++ b/gcc/tree-scalar-evolution.h
@ -34,17 +34,6 @@ extern void gather_stats_on_scev_database (void);
 extern void scev_analysis (void);
 void scev_const_prop (void);

-/* Affine iv.  */
-
-typedef struct
-{
-  /* Iv = BASE + STEP * i.  */
-  tree base, step;
-
-  /* True if this iv does not overflow.  */
-  bool no_overflow;
-} affine_iv;
-
 extern bool simple_iv (struct loop *, tree, tree, affine_iv *, bool);

 #endif  /* GCC_TREE_SCALAR_EVOLUTION_H  */
--- a/gcc/tree-ssa-loop-manip.c
+++ b/gcc/tree-ssa-loop-manip.c
@ -36,6 +36,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
 #include "tree-pass.h"
 #include "cfglayout.h"
 #include "tree-scalar-evolution.h"
+#include "params.h"

 /* Creates an induction variable with value BASE + STEP * iteration in LOOP.
   It is expected that neither BASE nor STEP are shared with other expressions
@ -618,3 +619,328 @@ tree_duplicate_loop_to_header_edge (struct loop *loop, edge e,

  return true;
 }
+
+/* Build if (COND) goto THEN_LABEL; else goto ELSE_LABEL;  */
+
+static tree
+build_if_stmt (tree cond, tree then_label, tree else_label)
+{
+  return build3 (COND_EXPR, void_type_node,
+		 cond,
+		 build1 (GOTO_EXPR, void_type_node, then_label),
+		 build1 (GOTO_EXPR, void_type_node, else_label));
+}
+
+/* Returns true if we can unroll LOOP FACTOR times.  Number
+   of iterations of the loop is returned in NITER.  */
+
+bool
+can_unroll_loop_p (struct loop *loop, unsigned factor,
+		   struct tree_niter_desc *niter)
+{
+  edge exit;
+
+  /* Check whether unrolling is possible.  We only want to unroll loops
+     for that we are able to determine number of iterations.  We also
+     want to split the extra iterations of the loop from its end,
+     therefore we require that the loop has precisely one
+     exit.  */
+
+  exit = single_dom_exit (loop);
+  if (!exit)
+    return false;
+
+  if (!number_of_iterations_exit (loop, exit, niter, false)
+      || niter->cmp == ERROR_MARK)
+    return false;
+
+  /* And of course, we must be able to duplicate the loop.  */
+  if (!can_duplicate_loop_p (loop))
+    return false;
+
+  /* The final loop should be small enough.  */
+  if (tree_num_loop_insns (loop) * factor
+      > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS))
+    return false;
+
+  return true;
+}
+
+/* Determines the conditions that control execution of LOOP unrolled FACTOR
+   times.  DESC is number of iterations of LOOP.  ENTER_COND is set to
+   condition that must be true if the main loop can be entered.
+   EXIT_BASE, EXIT_STEP, EXIT_CMP and EXIT_BOUND are set to values describing
+   how the exit from the unrolled loop should be controlled.  */
+
+static void
+determine_exit_conditions (struct loop *loop, struct tree_niter_desc *desc,
+			   unsigned factor, tree *enter_cond,
+			   tree *exit_base, tree *exit_step,
+			   enum tree_code *exit_cmp, tree *exit_bound)
+{
+  tree stmts;
+  tree base = desc->control.base;
+  tree step = desc->control.step;
+  tree bound = desc->bound;
+  tree type = TREE_TYPE (base);
+  tree bigstep, delta;
+  tree min = lower_bound_in_type (type, type);
+  tree max = upper_bound_in_type (type, type);
+  enum tree_code cmp = desc->cmp;
+  tree cond = boolean_true_node, assum;
+
+  *enter_cond = boolean_false_node;
+  *exit_base = NULL_TREE;
+  *exit_step = NULL_TREE;
+  *exit_cmp = ERROR_MARK;
+  *exit_bound = NULL_TREE;
+  gcc_assert (cmp != ERROR_MARK);
+
+  /* We only need to be correct when we answer question
+     "Do at least FACTOR more iterations remain?" in the unrolled loop.
+     Thus, transforming BASE + STEP * i <> BOUND to
+     BASE + STEP * i < BOUND is ok.  */
+  if (cmp == NE_EXPR)
+    {
+      if (tree_int_cst_sign_bit (step))
+	cmp = GT_EXPR;
+      else
+	cmp = LT_EXPR;
+    }
+  else if (cmp == LT_EXPR)
+    {
+      gcc_assert (!tree_int_cst_sign_bit (step));
+    }
+  else if (cmp == GT_EXPR)
+    {
+      gcc_assert (tree_int_cst_sign_bit (step));
+    }
+  else
+    gcc_unreachable ();
+
+  /* The main body of the loop may be entered iff:
+
+     1) desc->may_be_zero is false.
+     2) it is possible to check that there are at least FACTOR iterations
+	of the loop, i.e., BOUND - step * FACTOR does not overflow.
+     3) # of iterations is at least FACTOR  */
+
+  if (!zero_p (desc->may_be_zero))
+    cond = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+			invert_truthvalue (desc->may_be_zero),
+			cond);
+
+  bigstep = fold_build2 (MULT_EXPR, type, step,
+			 build_int_cst_type (type, factor));
+  delta = fold_build2 (MINUS_EXPR, type, bigstep, step);
+  if (cmp == LT_EXPR)
+    assum = fold_build2 (GE_EXPR, boolean_type_node,
+			 bound,
+			 fold_build2 (PLUS_EXPR, type, min, delta));
+  else
+    assum = fold_build2 (LE_EXPR, boolean_type_node,
+			 bound,
+			 fold_build2 (PLUS_EXPR, type, max, delta));
+  cond = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, assum, cond);
+
+  bound = fold_build2 (MINUS_EXPR, type, bound, delta);
+  assum = fold_build2 (cmp, boolean_type_node, base, bound);
+  cond = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, assum, cond);
+
+  cond = force_gimple_operand (unshare_expr (cond), &stmts, false, NULL_TREE);
+  if (stmts)
+    bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+  /* cond now may be a gimple comparison, which would be OK, but also any
+     other gimple rhs (say a && b).  In this case we need to force it to
+     operand.  */
+  if (!is_gimple_condexpr (cond))
+    {
+      cond = force_gimple_operand (cond, &stmts, true, NULL_TREE);
+      if (stmts)
+	bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+    }
+  *enter_cond = cond;
+
+  base = force_gimple_operand (unshare_expr (base), &stmts, true, NULL_TREE);
+  if (stmts)
+    bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+  bound = force_gimple_operand (unshare_expr (bound), &stmts, true, NULL_TREE);
+  if (stmts)
+    bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+
+  *exit_base = base;
+  *exit_step = bigstep;
+  *exit_cmp = cmp;
+  *exit_bound = bound;
+}
+
+/* Unroll LOOP FACTOR times.  LOOPS is the loops tree.  DESC describes
+   number of iterations of LOOP.  EXIT is the exit of the loop to that
+   DESC corresponds.
+   
+   If N is number of iterations of the loop and MAY_BE_ZERO is the condition
+   under that loop exits in the first iteration even if N != 0,
+   
+   while (1)
+     {
+       x = phi (init, next);
+
+       pre;
+       if (st)
+         break;
+       post;
+     }
+
+   becomes (with possibly the exit conditions formulated a bit differently,
+   avoiding the need to create a new iv):
+   
+   if (MAY_BE_ZERO || N < FACTOR)
+     goto rest;
+
+   do
+     {
+       x = phi (init, next);
+
+       pre;
+       post;
+       pre;
+       post;
+       ...
+       pre;
+       post;
+       N -= FACTOR;
+       
+     } while (N >= FACTOR);
+
+   rest:
+     init' = phi (init, x);
+
+   while (1)
+     {
+       x = phi (init', next);
+
+       pre;
+       if (st)
+         break;
+       post;
+     } */
+
+void
+tree_unroll_loop (struct loops *loops, struct loop *loop, unsigned factor,
+		  edge exit, struct tree_niter_desc *desc)
+{
+  tree dont_exit, exit_if, ctr_before, ctr_after;
+  tree enter_main_cond, exit_base, exit_step, exit_bound;
+  enum tree_code exit_cmp;
+  tree phi_old_loop, phi_new_loop, phi_rest, init, next, new_init, var;
+  struct loop *new_loop;
+  basic_block rest, exit_bb;
+  edge old_entry, new_entry, old_latch, precond_edge, new_exit;
+  edge nonexit, new_nonexit;
+  block_stmt_iterator bsi;
+  use_operand_p op;
+  bool ok;
+  unsigned est_niter;
+  sbitmap wont_exit;
+
+  est_niter = expected_loop_iterations (loop);
+  determine_exit_conditions (loop, desc, factor,
+			     &enter_main_cond, &exit_base, &exit_step,
+			     &exit_cmp, &exit_bound);
+
+  new_loop = loop_version (loops, loop, enter_main_cond, NULL, true);
+  gcc_assert (new_loop != NULL);
+  update_ssa (TODO_update_ssa);
+
+  /* Unroll the loop and remove the old exits.  */
+  dont_exit = ((exit->flags & EDGE_TRUE_VALUE)
+	       ? boolean_false_node
+	       : boolean_true_node);
+  if (exit == EDGE_SUCC (exit->src, 0))
+    nonexit = EDGE_SUCC (exit->src, 1);
+  else
+    nonexit = EDGE_SUCC (exit->src, 0);
+  nonexit->probability = REG_BR_PROB_BASE;
+  exit->probability = 0;
+  nonexit->count += exit->count;
+  exit->count = 0;
+  exit_if = last_stmt (exit->src);
+  COND_EXPR_COND (exit_if) = dont_exit;
+  update_stmt (exit_if);
+      
+  wont_exit = sbitmap_alloc (factor);
+  sbitmap_ones (wont_exit);
+  ok = tree_duplicate_loop_to_header_edge
+	  (loop, loop_latch_edge (loop), loops, factor - 1,
+	   wont_exit, NULL, NULL, NULL, DLTHE_FLAG_UPDATE_FREQ);
+  free (wont_exit);
+  gcc_assert (ok);
+  update_ssa (TODO_update_ssa);
+
+  /* Prepare the cfg and update the phi nodes.  */
+  rest = loop_preheader_edge (new_loop)->src;
+  precond_edge = single_pred_edge (rest);
+  loop_split_edge_with (loop_latch_edge (loop), NULL);
+  exit_bb = single_pred (loop->latch);
+
+  new_exit = make_edge (exit_bb, rest, EDGE_FALSE_VALUE);
+  new_exit->count = loop_preheader_edge (loop)->count;
+  est_niter = est_niter / factor + 1;
+  new_exit->probability = REG_BR_PROB_BASE / est_niter;
+
+  new_nonexit = single_pred_edge (loop->latch);
+  new_nonexit->flags = EDGE_TRUE_VALUE;
+  new_nonexit->probability = REG_BR_PROB_BASE - new_exit->probability;
+
+  old_entry = loop_preheader_edge (loop);
+  new_entry = loop_preheader_edge (new_loop);
+  old_latch = loop_latch_edge (loop);
+  for (phi_old_loop = phi_nodes (loop->header),
+       phi_new_loop = phi_nodes (new_loop->header);
+       phi_old_loop;
+       phi_old_loop = PHI_CHAIN (phi_old_loop),
+       phi_new_loop = PHI_CHAIN (phi_new_loop))
+    {
+      init = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_entry);
+      op = PHI_ARG_DEF_PTR_FROM_EDGE (phi_new_loop, new_entry);
+      gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR (op)));
+      next = PHI_ARG_DEF_FROM_EDGE (phi_old_loop, old_latch);
+
+      /* Prefer using original variable as a base for the new ssa name.
+	 This is necessary for virtual ops, and useful in order to avoid
+	 losing debug info for real ops.  */
+      if (TREE_CODE (next) == SSA_NAME)
+	var = SSA_NAME_VAR (next);
+      else if (TREE_CODE (init) == SSA_NAME)
+	var = SSA_NAME_VAR (init);
+      else
+	{
+	  var = create_tmp_var (TREE_TYPE (init), "unrinittmp");
+	  add_referenced_tmp_var (var);
+	}
+
+      new_init = make_ssa_name (var, NULL_TREE);
+      phi_rest = create_phi_node (new_init, rest);
+      SSA_NAME_DEF_STMT (new_init) = phi_rest;
+
+      add_phi_arg (phi_rest, init, precond_edge);
+      add_phi_arg (phi_rest, next, new_exit);
+      SET_USE (op, new_init);
+    }
+
+  /* Finally create the new counter for number of iterations and add the new
+     exit instruction.  */
+  bsi = bsi_last (exit_bb);
+  create_iv (exit_base, exit_step, NULL_TREE, loop,
+	     &bsi, true, &ctr_before, &ctr_after);
+  exit_if = build_if_stmt (build2 (exit_cmp, boolean_type_node, ctr_after,
+				   exit_bound),
+			   tree_block_label (loop->latch),
+			   tree_block_label (rest));
+  bsi_insert_after (&bsi, exit_if, BSI_NEW_STMT);
+
+  verify_flow_info ();
+  verify_dominators (CDI_DOMINATORS);
+  verify_loop_structure (loops);
+  verify_loop_closed_ssa ();
+}
--- a/gcc/tree-ssa-loop-niter.c
+++ b/gcc/tree-ssa-loop-niter.c
@ -140,6 +140,10 @@ number_of_iterations_ne (tree type, affine_iv *iv, tree final,
  tree niter_type = unsigned_type_for (type);
  tree s, c, d, bits, assumption, tmp, bound;

+  niter->control = *iv;
+  niter->bound = final;
+  niter->cmp = NE_EXPR;
+
  /* Rearrange the terms so that we get inequality s * i <> c, with s
     positive.  Also cast everything to the unsigned type.  */
  if (tree_int_cst_sign_bit (iv->step))
@ -410,6 +414,19 @@ number_of_iterations_lt (tree type, affine_iv *iv0, affine_iv *iv1,
  tree niter_type = unsigned_type_for (type);
  tree delta, step, s;

+  if (nonzero_p (iv0->step))
+    {
+      niter->control = *iv0;
+      niter->cmp = LT_EXPR;
+      niter->bound = iv1->base;
+    }
+  else
+    {
+      niter->control = *iv1;
+      niter->cmp = GT_EXPR;
+      niter->bound = iv0->base;
+    }
+
  delta = fold_build2 (MINUS_EXPR, niter_type,
 		       fold_convert (niter_type, iv1->base),
 		       fold_convert (niter_type, iv0->base));
@ -543,6 +560,9 @@ number_of_iterations_cond (tree type, affine_iv *iv0, enum tree_code code,
  niter->niter = NULL_TREE;
  niter->additional_info = boolean_true_node;

+  niter->bound = NULL_TREE;
+  niter->cmp = ERROR_MARK;
+
  /* Make < comparison from > ones, and for NE_EXPR comparisons, ensure that
     the control variable is on lhs.  */
  if (code == GE_EXPR || code == GT_EXPR
--- a/gcc/tree-ssa-loop.c
+++ b/gcc/tree-ssa-loop.c
@ -401,6 +401,40 @@ struct tree_opt_pass pass_complete_unroll =
  0					/* letter */
 };

+/* Prefetching.  */
+
+static void
+tree_ssa_loop_prefetch (void)
+{
+  if (!current_loops)
+    return;
+
+  tree_ssa_prefetch_arrays (current_loops);
+}
+
+static bool
+gate_tree_ssa_loop_prefetch (void)
+{
+  return flag_prefetch_loop_arrays == 1;
+}
+
+struct tree_opt_pass pass_loop_prefetch =
+{
+  "prefetch",				/* name */
+  gate_tree_ssa_loop_prefetch,		/* gate */
+  tree_ssa_loop_prefetch,	       	/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
+  TV_TREE_PREFETCH,	  		/* tv_id */
+  PROP_cfg | PROP_ssa,			/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_dump_func | TODO_verify_loops,	/* todo_flags_finish */
+  0					/* letter */
+};
+
 /* Induction variable optimizations.  */

 static void