Optionally pick the cheapest loop_vec_info

This patch adds a mode in which the vectoriser tries each available base vector mode and picks the one with the lowest cost. The new behaviour is selected by autovectorize_vector_modes. The patch keeps the current behaviour of preferring a VF of loop->simdlen over any larger or smaller VF, regardless of costs or target preferences. 2019-11-16 Richard Sandiford <richard.sandiford@arm.com> gcc/ * target.h (VECT_COMPARE_COSTS): New constant. * target.def (autovectorize_vector_modes): Return a bitmask of flags. * doc/tm.texi: Regenerate. * targhooks.h (default_autovectorize_vector_modes): Update accordingly. * targhooks.c (default_autovectorize_vector_modes): Likewise. * config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes): Likewise. * config/arc/arc.c (arc_autovectorize_vector_modes): Likewise. * config/arm/arm.c (arm_autovectorize_vector_modes): Likewise. * config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise. * config/mips/mips.c (mips_autovectorize_vector_modes): Likewise. * tree-vectorizer.h (_loop_vec_info::vec_outside_cost) (_loop_vec_info::vec_inside_cost): New member variables. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them. (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions. (vect_analyze_loop): When autovectorize_vector_modes returns VECT_COMPARE_COSTS, try vectorizing the loop with each available vector mode and picking the one with the lowest cost. (vect_estimate_min_profitable_iters): Record the computed costs in the loop_vec_info. From-SVN: r278336
2019-11-16 10:40:23 +00:00 · 2019-11-16 10:40:23 +00:00 · bcc7e346bf
commit bcc7e346bf
parent f884cd2fea
13 changed files with 219 additions and 19 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,26 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 	* target.h (VECT_COMPARE_COSTS): New constant.
 	* target.def (autovectorize_vector_modes): Return a bitmask of flags.
 	* doc/tm.texi: Regenerate.
 	* targhooks.h (default_autovectorize_vector_modes): Update accordingly.
 	* targhooks.c (default_autovectorize_vector_modes): Likewise.
 	* config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes):
 	Likewise.
 	* config/arc/arc.c (arc_autovectorize_vector_modes): Likewise.
 	* config/arm/arm.c (arm_autovectorize_vector_modes): Likewise.
 	* config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise.
 	* config/mips/mips.c (mips_autovectorize_vector_modes): Likewise.
 	* tree-vectorizer.h (_loop_vec_info::vec_outside_cost)
 	(_loop_vec_info::vec_inside_cost): New member variables.
 	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them.
 	(vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions.
 	(vect_analyze_loop): When autovectorize_vector_modes returns
 	VECT_COMPARE_COSTS, try vectorizing the loop with each available
 	vector mode and picking the one with the lowest cost.
 	(vect_estimate_min_profitable_iters): Record the computed costs
 	in the loop_vec_info.
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 	* tree-vectorizer.h (can_duplicate_and_interleave_p): Take an
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@ -15935,7 +15935,7 @@ aarch64_preferred_simd_mode (scalar_mode mode)
 /* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
-static void
+static unsigned int
 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
 {
  if (TARGET_SVE)
@ -15961,6 +15961,8 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
     TODO: We could similarly support limited forms of V2QImode and V2HImode
     for this case.  */
  modes->safe_push (V2SImode);
  return 0;
 }
 /* Implement TARGET_MANGLE_TYPE.  */
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@ -609,7 +609,7 @@ arc_preferred_simd_mode (scalar_mode mode)
 /* Implements target hook
   TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES.  */
-static void
+static unsigned int
 arc_autovectorize_vector_modes (vector_modes *modes, bool)
 {
  if (TARGET_PLUS_QMACW)
@ -617,6 +617,7 @@ arc_autovectorize_vector_modes (vector_modes *modes, bool)
      modes->quick_push (V4HImode);
      modes->quick_push (V2HImode);
    }
  return 0;
 }
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_misalignment (machine_mode mode,
 static void arm_conditional_register_usage (void);
 static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
 static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
-static void arm_autovectorize_vector_modes (vector_modes *, bool);
+static unsigned int arm_autovectorize_vector_modes (vector_modes *, bool);
 static int arm_default_branch_cost (bool, bool);
 static int arm_cortex_a5_branch_cost (bool, bool);
 static int arm_cortex_m_branch_cost (bool, bool);
@ -29015,7 +29015,7 @@ arm_vector_alignment (const_tree type)
  return align;
 }
-static void
+static unsigned int
 arm_autovectorize_vector_modes (vector_modes *modes, bool)
 {
  if (!TARGET_NEON_VECTORIZE_DOUBLE)
@ -29023,6 +29023,7 @@ arm_autovectorize_vector_modes (vector_modes *modes, bool)
      modes->safe_push (V16QImode);
      modes->safe_push (V8QImode);
    }
  return 0;
 }
 static bool
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@ -21384,7 +21384,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
   vectors.  If AVX512F is enabled then try vectorizing with 512bit,
   256bit and 128bit vectors.  */
-static void
+static unsigned int
 ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
 {
  if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
@ -21414,6 +21414,8 @@ ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
  if (TARGET_MMX_WITH_SSE)
    modes->safe_push (V8QImode);
  return 0;
 }
 /* Implemenation of targetm.vectorize.get_mask_mode.  */
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@ -13455,11 +13455,12 @@ mips_preferred_simd_mode (scalar_mode mode)
 /* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES.  */
-static void
+static unsigned int
 mips_autovectorize_vector_modes (vector_modes *modes, bool)
 {
  if (ISA_HAS_MSA)
    modes->safe_push (V16QImode);
  return 0;
 }
 /* Implement TARGET_INIT_LIBFUNCS.  */
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@ -6008,7 +6008,7 @@ against lower halves of vectors recursively until the specified mode is
 reached.  The default is @var{mode} which means no splitting.
@end deftypefn
-@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all})
+@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all})
 If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}
 is not the only approach worth considering, this hook should add one mode to
@var{modes} for each useful alternative approach.  These modes are then
@ -6024,9 +6024,19 @@ element mode.
 If @var{all} is true, add suitable vector modes even when they are generally
 not expected to be worthwhile.
 The hook returns a bitmask of flags that control how the modes in
@var{modes} are used.  The flags are:
@table @code
@item VECT_COMPARE_COSTS
 Tells the loop vectorizer to try all the provided modes and pick the one
 with the lowest cost.  By default the vectorizer will choose the first
 mode that works.
@end table
 The hook does not need to do anything if the vector returned by
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant
-for autovectorization.  The default implementation does nothing.
+for autovectorization.  The default implementation adds no modes and
 returns 0.
@end deftypefn
@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits})
--- a/gcc/target.def
+++ b/gcc/target.def
@ -1925,10 +1925,20 @@ element mode.\n\
 If @var{all} is true, add suitable vector modes even when they are generally\n\
 not expected to be worthwhile.\n\
 \n\
 The hook returns a bitmask of flags that control how the modes in\n\
@var{modes} are used.  The flags are:\n\
@table @code\n\
@item VECT_COMPARE_COSTS\n\
 Tells the loop vectorizer to try all the provided modes and pick the one\n\
 with the lowest cost.  By default the vectorizer will choose the first\n\
 mode that works.\n\
@end table\n\
 \n\
 The hook does not need to do anything if the vector returned by\n\
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\
-for autovectorization.  The default implementation does nothing.",
+for autovectorization.  The default implementation adds no modes and\n\
- void,
+returns 0.",
 unsigned int,
 (vector_modes *modes, bool all),
 default_autovectorize_vector_modes)
--- a/gcc/target.h
+++ b/gcc/target.h
@ -218,6 +218,14 @@ enum omp_device_kind_arch_isa {
  omp_device_isa
 };
 /* Flags returned by TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES:
   VECT_COMPARE_COSTS
       Tells the loop vectorizer to try all the provided modes and
       pick the one with the lowest cost.  By default the vectorizer
       will choose the first mode that works.  */
 const unsigned int VECT_COMPARE_COSTS = 1U << 0;
 /* The target structure.  This holds all the backend hooks.  */
 #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
 #define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@ -1300,9 +1300,10 @@ default_split_reduction (machine_mode mode)
 /* By default only the preferred vector mode is tried.  */
-void
+unsigned int
 default_autovectorize_vector_modes (vector_modes *, bool)
 {
  return 0;
 }
 /* The default implementation of TARGET_VECTORIZE_RELATED_MODE.  */
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@ -113,7 +113,7 @@ default_builtin_support_vector_misalignment (machine_mode mode,
 					     int, bool);
 extern machine_mode default_preferred_simd_mode (scalar_mode mode);
 extern machine_mode default_split_reduction (machine_mode);
-extern void default_autovectorize_vector_modes (vector_modes *, bool);
+extern unsigned int default_autovectorize_vector_modes (vector_modes *, bool);
 extern opt_machine_mode default_vectorize_related_mode (machine_mode,
 							scalar_mode,
 							poly_uint64);
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@ -829,6 +829,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
    scan_map (NULL),
    slp_unrolling_factor (1),
    single_scalar_iteration_cost (0),
    vec_outside_cost (0),
    vec_inside_cost (0),
    vectorizable (false),
    can_fully_mask_p (true),
    fully_masked_p (false),
@ -2374,6 +2376,80 @@ again:
  goto start_over;
 }
 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
   OLD_LOOP_VINFO is better unless something specifically indicates
   otherwise.
   Note that this deliberately isn't a partial order.  */
 static bool
 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
 			  loop_vec_info old_loop_vinfo)
 {
  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
  /* Always prefer a VF of loop->simdlen over any other VF.  */
  if (loop->simdlen)
    {
      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
      if (new_simdlen_p != old_simdlen_p)
 	return new_simdlen_p;
    }
  /* Limit the VFs to what is likely to be the maximum number of iterations,
     to handle cases in which at least one loop_vinfo is fully-masked.  */
  HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
  if (estimated_max_niter != -1)
    {
      if (known_le (estimated_max_niter, new_vf))
 	new_vf = estimated_max_niter;
      if (known_le (estimated_max_niter, old_vf))
 	old_vf = estimated_max_niter;
    }
  /* Check whether the (fractional) cost per scalar iteration is lower
     or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
  poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
 			     * poly_widest_int (old_vf));
  poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
 			     * poly_widest_int (new_vf));
  if (maybe_lt (rel_old, rel_new))
    return false;
  if (known_lt (rel_new, rel_old))
    return true;
  /* If there's nothing to choose between the loop bodies, see whether
     there's a difference in the prologue and epilogue costs.  */
  if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
    return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
  return false;
 }
 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */
 static bool
 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
 			loop_vec_info old_loop_vinfo)
 {
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
 		     "***** Preferring vector mode %s to vector mode %s\n",
 		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
 		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
  return true;
 }
 /* Function vect_analyze_loop.
   Apply a set of analyses on LOOP, and create a loop_vec_info struct
@ -2385,7 +2461,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
  auto_vector_modes vector_modes;
  /* Autodetect first vector size we try.  */
-  targetm.vectorize.autovectorize_vector_modes (&vector_modes,
+  unsigned int autovec_flags
    = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
 						    loop->simdlen != 0);
  unsigned int mode_i = 0;
@ -2409,6 +2486,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
  machine_mode next_vector_mode = VOIDmode;
  poly_uint64 lowest_th = 0;
  unsigned vectorized_loops = 0;
  bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
 			     && !unlimited_cost_model (loop));
  bool vect_epilogues = false;
  opt_result res = opt_result::success ();
@ -2429,6 +2508,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
      bool fatal = false;
      /* When pick_lowest_cost_p is true, we should in principle iterate
 	 over all the loop_vec_infos that LOOP_VINFO could replace and
 	 try to vectorize LOOP_VINFO under the same conditions.
 	 E.g. when trying to replace an epilogue loop, we should vectorize
 	 LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
 	 to replace the main loop, we should vectorize LOOP_VINFO as a main
 	 loop too.
 	 However, autovectorize_vector_modes is usually sorted as follows:
 	 - Modes that naturally produce lower VFs usually follow modes that
 	   naturally produce higher VFs.
 	 - When modes naturally produce the same VF, maskable modes
 	   usually follow unmaskable ones, so that the maskable mode
 	   can be used to vectorize the epilogue of the unmaskable mode.
 	 This order is preferred because it leads to the maximum
 	 epilogue vectorization opportunities.  Targets should only use
 	 a different order if they want to make wide modes available while
 	 disparaging them relative to earlier, smaller modes.  The assumption
 	 in that case is that the wider modes are more expensive in some
 	 way that isn't reflected directly in the costs.
 	 There should therefore be few interesting cases in which
 	 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
 	 treated as a standalone loop, and ends up being genuinely cheaper
 	 than FIRST_LOOP_VINFO.  */
      if (vect_epilogues)
 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
@ -2476,13 +2583,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 	      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
 	      simdlen = 0;
 	    }
 	  else if (pick_lowest_cost_p && first_loop_vinfo)
 	    {
 	      /* Keep trying to roll back vectorization attempts while the
 		 loop_vec_infos they produced were worse than this one.  */
 	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
 	      while (!vinfos.is_empty ()
 		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
 		{
 		  gcc_assert (vect_epilogues);
 		  delete vinfos.pop ();
 		}
 	      if (vinfos.is_empty ()
 		  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
 		{
 		  delete first_loop_vinfo;
 		  first_loop_vinfo = opt_loop_vec_info::success (NULL);
 		  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
 		}
 	    }
 	  if (first_loop_vinfo == NULL)
 	    {
 	      first_loop_vinfo = loop_vinfo;
 	      lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
 	    }
-	  else if (vect_epilogues)
+	  else if (vect_epilogues
 		   /* For now only allow one epilogue loop.  */
 		   && first_loop_vinfo->epilogue_vinfos.is_empty ())
 	    {
 	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
 	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
@ -2506,12 +2634,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 			    && param_vect_epilogues_nomask
 			    && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
 			    && !loop->simduid
-			    /* For now only allow one epilogue loop.  */
+			    /* For now only allow one epilogue loop, but allow
-			    && first_loop_vinfo->epilogue_vinfos.is_empty ());
+			       pick_lowest_cost_p to replace it.  */
 			    && (first_loop_vinfo->epilogue_vinfos.is_empty ()
 				|| pick_lowest_cost_p));
 	  /* Commit to first_loop_vinfo if we have no reason to try
 	     alternatives.  */
-	  if (!simdlen && !vect_epilogues)
+	  if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
 	    break;
 	}
      else
@ -3510,6 +3640,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
  /* Stash the costs so that we can compare two loop_vec_infos.  */
  loop_vinfo->vec_inside_cost = vec_inside_cost;
  loop_vinfo->vec_outside_cost = vec_outside_cost;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@ -601,6 +601,13 @@ public:
  /* Cost of a single scalar iteration.  */
  int single_scalar_iteration_cost;
  /* The cost of the vector prologue and epilogue, including peeled
     iterations and set-up code.  */
  int vec_outside_cost;
  /* The cost of the vector loop body.  */
  int vec_inside_cost;
  /* Is the loop vectorizable? */
  bool vectorizable;