tree-vectorizer.c (slpeel_add_loop_guard): Gimplify the condition.

2007-12-05 Harsha Jagasia <harsha.jagasia@amd.com> * tree-vectorizer.c (slpeel_add_loop_guard): Gimplify the condition. (set_prologue_iterations): New. Set the prologue iterations to total number of scalar iterations if the cost model check indicates that scalar code should be generated. (slpeel_tree_peel_loop_to_edge): Add a new parameter and code for generating the cost condition for epilog. Call set_prologue_iterations to generate cost condition for prolog. (new_loop_vec_info): Initialize LOOP_VINFO_NITERS_UNCHANGED. * tree-vectorizer.h (LOOP_VINFO_NITERS_UNCHANGED): New. (slpeel_tree_peel_loop_to_edge): Update declaration. (set_prologue_iterations): New declaration. * tree-vect-analyze.c (vect_analyze_loop_form): Update LOOP_VINFO_NITERS_UNCHANGED. * tree-vect-transform.c (vect_estimate_min_profitable_iters): Add new parameter and code to check if run time cost model test is needed. Remove code that adds builtin vectorization cost to scalar outside cost for the run time cost model test. If run time cost model test is needed add the appropriate guard cost to the scalar outside cost. The guard cost depends on whether the guard is generated at versioning or at prolog generation or at epilog generation. Change cost model equation to include scalar outside cost. (conservative_cost_threshold): New. Return the less conservative profitability threshold between the cost model threshold and the user defined vectorization threshold. (vect_do_peeling_for_loop_bound): Call conservative_cost_threshold. (vect_do_peeling_for_alignment): Same. (vect_loop_versioning): Same. (vect_create_cond_for_align_checks): ANDs the cost model condition with the alignment condition. (vect_transform_loop): Call loop versioning only when there is a misalignment or an aliasing problem. From-SVN: r130651
2007-12-06 16:18:55 +00:00 · 2007-12-06 16:18:55 +00:00 · 749cc4b1b3
parent d7bd8aebf6
commit 749cc4b1b3
5 changed files with 446 additions and 81 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,39 @@
+2007-12-06  Harsha Jagasia <harsha.jagasia@amd.com>
+
+	* tree-vectorizer.c (slpeel_add_loop_guard): Gimplify the condition.
+	(set_prologue_iterations): New. Set the prologue iterations to total
+	number of scalar iterations if the cost model check indicates that
+	scalar code should be generated.
+	(slpeel_tree_peel_loop_to_edge): Add a new parameter and code for 
+	generating the cost condition for epilog. Call
+	set_prologue_iterations to generate cost condition for prolog.
+	(new_loop_vec_info): Initialize LOOP_VINFO_NITERS_UNCHANGED.
+	* tree-vectorizer.h (LOOP_VINFO_NITERS_UNCHANGED): New.
+	(slpeel_tree_peel_loop_to_edge): Update declaration.
+	(set_prologue_iterations): New declaration.
+	* tree-vect-analyze.c (vect_analyze_loop_form): Update 
+	LOOP_VINFO_NITERS_UNCHANGED.
+	* tree-vect-transform.c
+	(vect_estimate_min_profitable_iters): Add new parameter and
+	code to  check if run time cost model test is needed.
+	Remove code that adds builtin vectorization cost to scalar
+	outside cost for the run time cost model test. If run time
+	cost model test is needed add the appropriate guard cost to
+	the scalar outside cost. The guard cost depends on whether
+	the guard is generated at versioning or at prolog generation
+	or at epilog generation. Change cost model equation to include
+	scalar outside cost.
+	(conservative_cost_threshold): New. Return the less conservative
+	profitability threshold between the cost model threshold and the
+	user defined vectorization threshold.
+	(vect_do_peeling_for_loop_bound): Call conservative_cost_threshold.
+	(vect_do_peeling_for_alignment): Same.
+	(vect_loop_versioning): Same.
+	(vect_create_cond_for_align_checks): ANDs the cost model condition
+	with the alignment condition.
+	(vect_transform_loop): Call loop versioning only when there is a
+	misalignment or an aliasing problem.
+
 2007-12-06  Jakub Jelinek  <jakub@redhat.com>

 	PR middle-end/20983
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@ -602,6 +602,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
+
  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
@ -4216,6 +4217,7 @@ vect_analyze_loop_form (struct loop *loop)

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@ -119,11 +119,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
+  int scalar_outside_cost = 0;
+  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
-  int byte_misalign;
+  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
@ -137,6 +139,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
      return 0;
    }

+  /* If the number of iterations is unknown, or the
+     peeling-for-misalignment amount is unknown, we will have to generate
+     a runtime test to test the loop count against the threshold.    */
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      || (byte_misalign < 0))
+    runtime_test = true;
+
  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
@ -211,8 +220,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

-  byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
-
  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
@ -276,24 +283,72 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

-  /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
-     information we provide for the target is whether testing against the
-     threshold involves a runtime test.  */
-  if (targetm.vectorize.builtin_vectorization_cost)
-    {
-      bool runtime_test = false;
+  /* FORNOW: The scalar outside cost is incremented in one of the
+     following ways:

-      /* If the number of iterations is unknown, or the
-	 peeling-for-misalignment amount is unknown, we eill have to generate
-	 a runtime test to test the loop count against the threshold.  */
-      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	  || (byte_misalign < 0))
-	runtime_test = true;
-      vec_outside_cost +=
-	targetm.vectorize.builtin_vectorization_cost (runtime_test);
-      if (vect_print_dump_info (REPORT_DETAILS))
-	fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
-		  targetm.vectorize.builtin_vectorization_cost (runtime_test));
+     1. The vectorizer checks for alignment and aliasing and generates
+     a condition that allows dynamic vectorization.  A cost model
+     check is ANDED with the versioning condition.  Hence scalar code
+     path now has the added cost of the versioning check.
+
+       if (cost > th & versioning_check)
+         jmp to vector code
+
+     Hence run-time scalar is incremented by not-taken branch cost.
+
+     2. The vectorizer then checks if a prologue is required.  If the
+     cost model check was not done before during versioning, it has to
+     be done before the prologue check.
+
+       if (cost <= th)
+         prologue = scalar_iters
+       if (prologue == 0)
+         jmp to vector code
+       else
+         execute prologue
+       if (prologue == num_iters)
+	 go to exit
+
+     Hence the run-time scalar cost is incremented by a taken branch,
+     plus a not-taken branch, plus a taken branch cost.
+
+     3. The vectorizer then checks if an epilogue is required.  If the
+     cost model check was not done before during prologue check, it
+     has to be done with the epilogue check.
+
+       if (prologue == 0)
+         jmp to vector code
+       else
+         execute prologue
+       if (prologue == num_iters)
+	 go to exit
+       vector code:
+         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
+           jmp to epilogue
+
+     Hence the run-time scalar cost should be incremented by 2 taken
+     branches.
+
+     TODO: The back end may reorder the BBS's differently and reverse
+     conditions/branch directions.  Change the stimates below to
+     something more reasonable.  */
+
+  if (runtime_test)
+    {
+      /* Cost model check occurs at versioning.  */
+      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+	  || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+	scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
+      else
+	{
+	  /* Cost model occurs at prologue generation.  */
+	  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+	    scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
+	      + TARG_COND_NOT_TAKEN_BRANCH_COST;
+	  /* Cost model check occurs at epilogue generation.  */
+	  else
+	    scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
+	}
    }

  /* Add SLP costs.  */
@ -306,9 +361,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)

  /* Calculate number of iterations required to make the vector version 
     profitable, relative to the loop bodies only. The following condition
-     must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
+     must hold true: 
+     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
-     VOC = vector outside cost and VF = vectorization factor.  */
+     VOC = vector outside cost, VF = vectorization factor,
+     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
+     SOC = scalar outside cost for run time cost model check.  */

  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
@ -316,15 +375,15 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        min_profitable_iters = 1;
      else
        {
-          min_profitable_iters = (vec_outside_cost * vf 
-                                  - vec_inside_cost * peel_iters_prologue
+          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
+				  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
-                  + (vec_outside_cost * vf)))
+                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
@ -346,7 +405,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
 	       vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
 	       vec_outside_cost);
-      fprintf (vect_dump, "  Scalar cost: %d\n", scalar_single_iter_cost);
+      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
+	       scalar_single_iter_cost);
+      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
@ -6467,6 +6528,37 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
    }
 }

+/* Return the more conservative threshold between the
+   min_profitable_iters returned by the cost model and the user
+   specified threshold, if provided.  */
+
+static unsigned int
+conservative_cost_threshold (loop_vec_info loop_vinfo,
+			     int min_profitable_iters)
+{
+  unsigned int th;
+  int min_scalar_loop_bound;
+
+  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+			    * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
+
+  /* Use the cost model only if it is more conservative than user specified
+     threshold.  */
+  th = (unsigned) min_scalar_loop_bound;
+  if (min_profitable_iters
+      && (!min_scalar_loop_bound
+          || min_profitable_iters > min_scalar_loop_bound))
+    th = (unsigned) min_profitable_iters;
+
+  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))	      
+    fprintf (vect_dump, "not vectorized: vectorization may not be"
+	     "profitable.");
+  
+  if (th && vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "Vectorization may not be profitable.");
+
+  return th;
+}

 /* Function vect_do_peeling_for_loop_bound

@ -6487,8 +6579,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
  edge update_e;
  basic_block preheader;
  int loop_num;
-  unsigned int th;
-  int min_scalar_loop_bound;
+  bool check_profitability = false;
+  unsigned int th = 0;
  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
@ -6506,28 +6598,24 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)

  loop_num  = loop->num; 

-  /* Analyze cost to set threshhold for vectorized loop.  */
-  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
-  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
-			    * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
+  /* If cost model check not done during versioning and 
+     peeling for alignment.  */
+  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
+      && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+    {
+      check_profitability = true;

-  /* Use the cost model only if it is more conservative than user specified
-     threshold.  */
+      /* Get profitability threshold for vectorized loop.  */
+      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);

-  th = (unsigned) min_scalar_loop_bound;
-  if (min_profitable_iters
-      && (!min_scalar_loop_bound
-          || min_profitable_iters > min_scalar_loop_bound))
-    th = (unsigned) min_profitable_iters;
-
-  if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
-      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
-      && vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "vectorization may not be profitable.");
+      th = conservative_cost_threshold (loop_vinfo, 
+					min_profitable_iters);
+    }

  new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
                                            ratio_mult_vf_name, ni_name, false,
-                                            th);
+                                            th, check_profitability);
  gcc_assert (new_loop);
  gcc_assert (loop_num == loop->num);
 #ifdef ENABLE_CHECKING
@ -6745,6 +6833,9 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
  tree niters_of_prolog_loop, ni_name;
  tree n_iters;
  struct loop *new_loop;
+  bool check_profitability = false;
+  unsigned int th = 0;
+  int min_profitable_iters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
@ -6754,10 +6845,26 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
  ni_name = vect_build_loop_niters (loop_vinfo);
  niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
  
+
+  /* If cost model check not done during versioning.  */
+  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+    {
+      check_profitability = true;
+
+      /* Get profitability threshold for vectorized loop.  */
+      min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+      th = conservative_cost_threshold (loop_vinfo, 
+					min_profitable_iters);
+    }
+
  /* Peel the prolog loop and iterate it niters_of_prolog_loop.  */
-  new_loop = 
-	slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), 
-				       niters_of_prolog_loop, ni_name, true, 0); 
+  new_loop =
+    slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
+				   niters_of_prolog_loop, ni_name, true,
+				   th, check_profitability);
+
  gcc_assert (new_loop);
 #ifdef ENABLE_CHECKING
  slpeel_verify_cfg_after_peeling (new_loop, loop);
@ -6785,6 +6892,8 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
   checked at runtime.

   Input:
+   COND_EXPR  - input conditional expression.  New conditions will be chained
+                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
@ -6801,8 +6910,9 @@ vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
        test can be done as a&(n-1) == 0.  For example, for 16
        byte vectors the test is a&0xf == 0.  */

-static tree
+static void
 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
+                                   tree *cond_expr,
                                   tree *cond_expr_stmt_list)
 {
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@ -6818,6 +6928,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
  tree or_tmp_name = NULL_TREE;
  tree and_tmp, and_tmp_name, and_stmt;
  tree ptrsize_zero;
+  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
@ -6891,8 +7002,13 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
  /* Make and_tmp the left operand of the conditional test against zero.
     if and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
-  return build2 (EQ_EXPR, boolean_type_node,
-                 and_tmp_name, ptrsize_zero);
+  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+				and_tmp_name, ptrsize_zero);
+  if (*cond_expr)
+    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+			      *cond_expr, part_cond_expr);
+  else
+    *cond_expr = part_cond_expr;
 }

 /* Function vect_vfa_segment_size.
@ -6933,7 +7049,7 @@ vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
-                with logical and operation.
+                with logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
 	        to be checked.

@ -7055,7 +7171,11 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pairs of DDRs whose
   independence was not proven.  The vectorized version of loop is 
-   executed only if both alias and alignment tests are passed.  */
+   executed only if both alias and alignment tests are passed.  
+  
+   The test generated to check which version of loop is executed
+   is modified to also check for profitability as indicated by the 
+   cost model initially.  */

 static void
 vect_loop_versioning (loop_vec_info loop_vinfo)
@ -7072,17 +7192,30 @@ vect_loop_versioning (loop_vec_info loop_vinfo)
  tree orig_phi, new_phi, arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  tree gimplify_stmt_list;
+  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
+  int min_profitable_iters = 0;
+  unsigned int th;

-  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
-      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
-    return;
+  /* Get profitability threshold for vectorized loop.  */
+  min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
+
+  th = conservative_cost_threshold (loop_vinfo,
+				    min_profitable_iters);
+
+  cond_expr =
+    build2 (GT_EXPR, boolean_type_node, scalar_loop_iters, 
+	    build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+
+  cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
+				    false, NULL_TREE);

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
-    cond_expr =
-      vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
+      vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
+					 &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
-    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, &cond_expr_stmt_list);
+    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, 
+				       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
@ -7260,7 +7393,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");
-  vect_loop_versioning (loop_vinfo);
+
+  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
+      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
+    vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@ -918,20 +918,29 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)

 static edge
 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
-		        basic_block dom_bb)
+		       basic_block dom_bb)
 {
  block_stmt_iterator bsi;
  edge new_e, enter_e;
  tree cond_stmt;
+  tree gimplify_stmt_list;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  bsi = bsi_last (guard_bb);

+  cond =
+    force_gimple_operand (cond, &gimplify_stmt_list, true,
+			  NULL_TREE);
  cond_stmt = build3 (COND_EXPR, void_type_node, cond,
 		      NULL_TREE, NULL_TREE);
+  if (gimplify_stmt_list)
+    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+  bsi = bsi_last (guard_bb);
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+
  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
@ -1007,12 +1016,89 @@ slpeel_verify_cfg_after_peeling (struct loop *first_loop,
 }
 #endif

+/* If the run time cost model check determines that vectorization is
+   not profitable and hence scalar loop should be generated then set
+   FIRST_NITERS to prologue peeled iterations. This will allow all the
+   iterations to be executed in the prologue peeled scalar loop.  */
+
+void
+set_prologue_iterations (basic_block bb_before_first_loop,
+			 tree first_niters,
+			 struct loop *loop,
+			 unsigned int th)
+{
+  edge e;
+  basic_block cond_bb, then_bb;
+  tree var, prologue_after_cost_adjust_name, stmt;
+  block_stmt_iterator bsi;
+  tree newphi;
+  edge e_true, e_false, e_fallthru;
+  tree cond_stmt;
+  tree gimplify_stmt_list;
+  tree cost_pre_condition = NULL_TREE;
+  tree scalar_loop_iters = 
+    LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop));
+
+  e = single_pred_edge (bb_before_first_loop);
+  cond_bb = split_edge(e);
+
+  e = single_pred_edge (bb_before_first_loop);
+  then_bb = split_edge(e);
+  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
+
+  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
+				   EDGE_FALSE_VALUE);
+  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
+
+  e_true = EDGE_PRED (then_bb, 0);
+  e_true->flags &= ~EDGE_FALLTHRU;
+  e_true->flags |= EDGE_TRUE_VALUE;
+
+  e_fallthru = EDGE_SUCC (then_bb, 0);
+
+  cost_pre_condition =
+    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters, 
+	    build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+  cost_pre_condition =
+    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
+			  true, NULL_TREE);
+  cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
+		      NULL_TREE, NULL_TREE);
+
+  bsi = bsi_last (cond_bb);
+  if (gimplify_stmt_list)
+    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
+
+  bsi = bsi_last (cond_bb);
+  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
+					  
+  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
+			"prologue_after_cost_adjust");
+  add_referenced_var (var);
+  prologue_after_cost_adjust_name = 
+    force_gimple_operand (scalar_loop_iters, &stmt, false, var);
+
+  bsi = bsi_last (then_bb);
+  if (stmt)
+    bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+
+  newphi = create_phi_node (var, bb_before_first_loop);
+  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
+  add_phi_arg (newphi, first_niters, e_false);
+
+  first_niters = PHI_RESULT (newphi);
+}
+
+
 /* Function slpeel_tree_peel_loop_to_edge.

   Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
   that is placed on the entry (exit) edge E of LOOP. After this transformation
   we have two loops one after the other - first-loop iterates FIRST_NITERS
   times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
+   If the cost model indicates that it is profitable to emit a scalar 
+   loop instead of the vector one, then the prolog (epilog) loop will iterate
+   for the entire unchanged scalar iterations of the loop.

   Input:
   - LOOP: the loop to be peeled.
@ -1027,6 +1113,13 @@ slpeel_verify_cfg_after_peeling (struct loop *first_loop,
        for updating the loop bound of the first-loop to FIRST_NITERS.  If it
        is false, the caller of this function may want to take care of this
        (this can be useful if we don't want new stmts added to first-loop).
+   - TH: cost model profitability threshold of iterations for vectorization.
+   - CHECK_PROFITABILITY: specify whether cost model check has not occured
+                          during versioning and hence needs to occur during
+			  prologue generation or whether cost model check 
+			  has not occured during prologue generation and hence
+			  needs to occur during epilogue generation.
+	    

   Output:
   The function returns a pointer to the new loop-copy, or NULL if it failed
@ -1048,11 +1141,11 @@ struct loop*
 slpeel_tree_peel_loop_to_edge (struct loop *loop, 
 			       edge e, tree first_niters, 
 			       tree niters, bool update_first_loop_count,
-			       unsigned int th)
+			       unsigned int th, bool check_profitability)
 {
  struct loop *new_loop = NULL, *first_loop, *second_loop;
  edge skip_e;
-  tree pre_condition;
+  tree pre_condition = NULL_TREE;
  bitmap definitions;
  basic_block bb_before_second_loop, bb_after_second_loop;
  basic_block bb_before_first_loop;
@ -1060,6 +1153,9 @@ slpeel_tree_peel_loop_to_edge (struct loop *loop,
  basic_block new_exit_bb;
  edge exit_e = single_exit (loop);
  LOC loop_loc;
+  tree cost_pre_condition = NULL_TREE;
+  tree scalar_loop_iters = 
+    LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop));
  
  if (!slpeel_can_duplicate_loop_p (loop, e))
    return NULL;
@ -1116,32 +1212,121 @@ slpeel_tree_peel_loop_to_edge (struct loop *loop,
  rename_variables_in_loop (new_loop);


-  /* 2. Add the guard that controls whether the first loop is executed.
-        Resulting CFG would be:
+  /* 2.  Add the guard code in one of the following ways:

-        bb_before_first_loop:
-        if (FIRST_NITERS == 0) GOTO bb_before_second_loop
-                               GOTO first-loop
+     2.a Add the guard that controls whether the first loop is executed.
+         This occurs when this function is invoked for prologue or epilogiue
+	 generation and when the cost model check can be done at compile time.

-        first_loop:
-        do {
-        } while ...
+         Resulting CFG would be:

-        bb_before_second_loop:
+         bb_before_first_loop:
+         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop

-        second_loop:
-        do {
-        } while ...
+         first_loop:
+         do {
+         } while ...

-        orig_exit_bb:
-   */
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+
+     2.b Add the cost model check that allows the prologue
+         to iterate for the entire unchanged scalar
+         iterations of the loop in the event that the cost
+         model indicates that the scalar loop is more
+         profitable than the vector one. This occurs when
+	 this function is invoked for prologue generation
+	 and the cost model check needs to be done at run
+	 time.
+
+         Resulting CFG after prologue peeling would be:
+
+         if (scalar_loop_iterations <= th)
+           FIRST_NITERS = scalar_loop_iterations
+
+         bb_before_first_loop:
+         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
+
+         first_loop:
+         do {
+         } while ...
+
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+
+     2.c Add the cost model check that allows the epilogue
+         to iterate for the entire unchanged scalar
+         iterations of the loop in the event that the cost
+         model indicates that the scalar loop is more
+         profitable than the vector one. This occurs when
+	 this function is invoked for epilogue generation
+	 and the cost model check needs to be done at run
+	 time.
+
+         Resulting CFG after prologue peeling would be:
+
+         bb_before_first_loop:
+         if ((scalar_loop_iterations <= th)
+             ||
+             FIRST_NITERS == 0) GOTO bb_before_second_loop
+                                GOTO first-loop
+
+         first_loop:
+         do {
+         } while ...
+
+         bb_before_second_loop:
+
+         second_loop:
+         do {
+         } while ...
+
+         orig_exit_bb:
+  */

  bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
  bb_before_second_loop = split_edge (single_exit (first_loop));

-  pre_condition =
-    fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
-	build_int_cst (TREE_TYPE (first_niters), th));
+  /* Epilogue peeling.  */
+  if (!update_first_loop_count)
+    {
+      pre_condition =
+	fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
+		     build_int_cst (TREE_TYPE (first_niters), 0));
+      if (check_profitability)
+	{
+	    cost_pre_condition = 
+	    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters, 
+		    build_int_cst (TREE_TYPE (scalar_loop_iters), th));
+  
+	  pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+				       cost_pre_condition, pre_condition);
+	}
+    }
+
+  /* Prologue peeling.  */  
+  else
+    {
+      if (check_profitability)
+	set_prologue_iterations (bb_before_first_loop, first_niters,
+				 loop, th);
+
+      pre_condition =
+	fold_build2 (LE_EXPR, boolean_type_node, first_niters, 
+		     build_int_cst (TREE_TYPE (first_niters), 0));
+    }

  skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                  bb_before_second_loop, bb_before_first_loop);
@ -1468,6 +1653,7 @@ new_loop_vec_info (struct loop *loop)

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
+  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@ -228,6 +228,9 @@ typedef struct _loop_vec_info {
 #define LOOP_VINFO_LOOP(L)            (L)->loop
 #define LOOP_VINFO_BBS(L)             (L)->bbs
 #define LOOP_VINFO_NITERS(L)          (L)->num_iters
+/* Since LOOP_VINFO_NITERS can change after prologue peeling
+   retain total unchanged scalar loop iterations for cost model.  */
+#define LOOP_VINFO_NITERS_UNCHANGED(L)          (L)->num_iters
 #define LOOP_VINFO_COST_MODEL_MIN_ITERS(L)	(L)->min_profitable_iters
 #define LOOP_VINFO_VECTORIZABLE_P(L)  (L)->vectorizable
 #define LOOP_VINFO_VECT_FACTOR(L)     (L)->vectorization_factor
@ -630,7 +633,9 @@ extern bitmap vect_memsyms_to_rename;
   divide by the vectorization factor, and to peel the first few iterations
   to force the alignment of data references in the loop.  */
 extern struct loop *slpeel_tree_peel_loop_to_edge 
-  (struct loop *, edge, tree, tree, bool, unsigned int);
+  (struct loop *, edge, tree, tree, bool, unsigned int, bool);
+extern void set_prologue_iterations (basic_block, tree,
+				     struct loop *, unsigned int);
 extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree);
 extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 #ifdef ENABLE_CHECKING