Optionally pick the cheapest loop_vec_info

This patch adds a mode in which the vectoriser tries each available
base vector mode and picks the one with the lowest cost.  The new
behaviour is selected by autovectorize_vector_modes.

The patch keeps the current behaviour of preferring a VF of
loop->simdlen over any larger or smaller VF, regardless of costs
or target preferences.

2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* target.h (VECT_COMPARE_COSTS): New constant.
	* target.def (autovectorize_vector_modes): Return a bitmask of flags.
	* doc/tm.texi: Regenerate.
	* targhooks.h (default_autovectorize_vector_modes): Update accordingly.
	* targhooks.c (default_autovectorize_vector_modes): Likewise.
	* config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes):
	Likewise.
	* config/arc/arc.c (arc_autovectorize_vector_modes): Likewise.
	* config/arm/arm.c (arm_autovectorize_vector_modes): Likewise.
	* config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise.
	* config/mips/mips.c (mips_autovectorize_vector_modes): Likewise.
	* tree-vectorizer.h (_loop_vec_info::vec_outside_cost)
	(_loop_vec_info::vec_inside_cost): New member variables.
	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them.
	(vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions.
	(vect_analyze_loop): When autovectorize_vector_modes returns
	VECT_COMPARE_COSTS, try vectorizing the loop with each available
	vector mode and picking the one with the lowest cost.
	(vect_estimate_min_profitable_iters): Record the computed costs
	in the loop_vec_info.

From-SVN: r278336
This commit is contained in:
Richard Sandiford 2019-11-16 10:40:23 +00:00 committed by Richard Sandiford
parent f884cd2fea
commit bcc7e346bf
13 changed files with 219 additions and 19 deletions

View File

@@ -1,3 +1,26 @@
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
* target.h (VECT_COMPARE_COSTS): New constant.
* target.def (autovectorize_vector_modes): Return a bitmask of flags.
* doc/tm.texi: Regenerate.
* targhooks.h (default_autovectorize_vector_modes): Update accordingly.
* targhooks.c (default_autovectorize_vector_modes): Likewise.
* config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes):
Likewise.
* config/arc/arc.c (arc_autovectorize_vector_modes): Likewise.
* config/arm/arm.c (arm_autovectorize_vector_modes): Likewise.
* config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise.
* config/mips/mips.c (mips_autovectorize_vector_modes): Likewise.
* tree-vectorizer.h (_loop_vec_info::vec_outside_cost)
(_loop_vec_info::vec_inside_cost): New member variables.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them.
(vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions.
(vect_analyze_loop): When autovectorize_vector_modes returns
VECT_COMPARE_COSTS, try vectorizing the loop with each available
vector mode and picking the one with the lowest cost.
(vect_estimate_min_profitable_iters): Record the computed costs
in the loop_vec_info.
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
* tree-vectorizer.h (can_duplicate_and_interleave_p): Take an

View File

@@ -15935,7 +15935,7 @@ aarch64_preferred_simd_mode (scalar_mode mode)
/* Return a list of possible vector sizes for the vectorizer
to iterate over. */
static void
static unsigned int
aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
{
if (TARGET_SVE)
@@ -15961,6 +15961,8 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
TODO: We could similarly support limited forms of V2QImode and V2HImode
for this case. */
modes->safe_push (V2SImode);
return 0;
}
/* Implement TARGET_MANGLE_TYPE. */

View File

@@ -609,7 +609,7 @@ arc_preferred_simd_mode (scalar_mode mode)
/* Implements target hook
TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */
static void
static unsigned int
arc_autovectorize_vector_modes (vector_modes *modes, bool)
{
if (TARGET_PLUS_QMACW)
@@ -617,6 +617,7 @@ arc_autovectorize_vector_modes (vector_modes *modes, bool)
modes->quick_push (V4HImode);
modes->quick_push (V2HImode);
}
return 0;
}

View File

@@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_misalignment (machine_mode mode,
static void arm_conditional_register_usage (void);
static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
static void arm_autovectorize_vector_modes (vector_modes *, bool);
static unsigned int arm_autovectorize_vector_modes (vector_modes *, bool);
static int arm_default_branch_cost (bool, bool);
static int arm_cortex_a5_branch_cost (bool, bool);
static int arm_cortex_m_branch_cost (bool, bool);
@@ -29015,7 +29015,7 @@ arm_vector_alignment (const_tree type)
return align;
}
static void
static unsigned int
arm_autovectorize_vector_modes (vector_modes *modes, bool)
{
if (!TARGET_NEON_VECTORIZE_DOUBLE)
@@ -29023,6 +29023,7 @@ arm_autovectorize_vector_modes (vector_modes *modes, bool)
modes->safe_push (V16QImode);
modes->safe_push (V8QImode);
}
return 0;
}
static bool

View File

@@ -21384,7 +21384,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
vectors. If AVX512F is enabled then try vectorizing with 512bit,
256bit and 128bit vectors. */
static void
static unsigned int
ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
{
if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
@@ -21414,6 +21414,8 @@ ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
if (TARGET_MMX_WITH_SSE)
modes->safe_push (V8QImode);
return 0;
}
/* Implemenation of targetm.vectorize.get_mask_mode. */

View File

@@ -13455,11 +13455,12 @@ mips_preferred_simd_mode (scalar_mode mode)
/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */
static void
static unsigned int
mips_autovectorize_vector_modes (vector_modes *modes, bool)
{
if (ISA_HAS_MSA)
modes->safe_push (V16QImode);
return 0;
}
/* Implement TARGET_INIT_LIBFUNCS. */

View File

@@ -6008,7 +6008,7 @@ against lower halves of vectors recursively until the specified mode is
reached. The default is @var{mode} which means no splitting.
@end deftypefn
@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all})
@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all})
If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}
is not the only approach worth considering, this hook should add one mode to
@var{modes} for each useful alternative approach. These modes are then
@@ -6024,9 +6024,19 @@ element mode.
If @var{all} is true, add suitable vector modes even when they are generally
not expected to be worthwhile.
The hook returns a bitmask of flags that control how the modes in
@var{modes} are used. The flags are:
@table @code
@item VECT_COMPARE_COSTS
Tells the loop vectorizer to try all the provided modes and pick the one
with the lowest cost. By default the vectorizer will choose the first
mode that works.
@end table
The hook does not need to do anything if the vector returned by
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant
for autovectorization. The default implementation does nothing.
for autovectorization. The default implementation adds no modes and
returns 0.
@end deftypefn
@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits})

View File

@@ -1925,10 +1925,20 @@ element mode.\n\
If @var{all} is true, add suitable vector modes even when they are generally\n\
not expected to be worthwhile.\n\
\n\
The hook returns a bitmask of flags that control how the modes in\n\
@var{modes} are used. The flags are:\n\
@table @code\n\
@item VECT_COMPARE_COSTS\n\
Tells the loop vectorizer to try all the provided modes and pick the one\n\
with the lowest cost. By default the vectorizer will choose the first\n\
mode that works.\n\
@end table\n\
\n\
The hook does not need to do anything if the vector returned by\n\
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\
for autovectorization. The default implementation does nothing.",
void,
for autovectorization. The default implementation adds no modes and\n\
returns 0.",
unsigned int,
(vector_modes *modes, bool all),
default_autovectorize_vector_modes)

View File

@@ -218,6 +218,14 @@ enum omp_device_kind_arch_isa {
omp_device_isa
};
/* Flags returned by TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES:
VECT_COMPARE_COSTS
Tells the loop vectorizer to try all the provided modes and
pick the one with the lowest cost. By default the vectorizer
will choose the first mode that works. */
const unsigned int VECT_COMPARE_COSTS = 1U << 0;
/* The target structure. This holds all the backend hooks. */
#define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
#define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;

View File

@@ -1300,9 +1300,10 @@ default_split_reduction (machine_mode mode)
/* By default only the preferred vector mode is tried. */
void
unsigned int
default_autovectorize_vector_modes (vector_modes *, bool)
{
return 0;
}
/* The default implementation of TARGET_VECTORIZE_RELATED_MODE. */

View File

@@ -113,7 +113,7 @@ default_builtin_support_vector_misalignment (machine_mode mode,
int, bool);
extern machine_mode default_preferred_simd_mode (scalar_mode mode);
extern machine_mode default_split_reduction (machine_mode);
extern void default_autovectorize_vector_modes (vector_modes *, bool);
extern unsigned int default_autovectorize_vector_modes (vector_modes *, bool);
extern opt_machine_mode default_vectorize_related_mode (machine_mode,
scalar_mode,
poly_uint64);

View File

@@ -829,6 +829,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
scan_map (NULL),
slp_unrolling_factor (1),
single_scalar_iteration_cost (0),
vec_outside_cost (0),
vec_inside_cost (0),
vectorizable (false),
can_fully_mask_p (true),
fully_masked_p (false),
@@ -2374,6 +2376,80 @@ again:
goto start_over;
}
/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
OLD_LOOP_VINFO is better unless something specifically indicates
otherwise.
Note that this deliberately isn't a partial order. */
static bool
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
loop_vec_info old_loop_vinfo)
{
struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
/* Always prefer a VF of loop->simdlen over any other VF. */
if (loop->simdlen)
{
bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
if (new_simdlen_p != old_simdlen_p)
return new_simdlen_p;
}
/* Limit the VFs to what is likely to be the maximum number of iterations,
to handle cases in which at least one loop_vinfo is fully-masked. */
HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
if (estimated_max_niter != -1)
{
if (known_le (estimated_max_niter, new_vf))
new_vf = estimated_max_niter;
if (known_le (estimated_max_niter, old_vf))
old_vf = estimated_max_niter;
}
/* Check whether the (fractional) cost per scalar iteration is lower
or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
* poly_widest_int (old_vf));
poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
* poly_widest_int (new_vf));
if (maybe_lt (rel_old, rel_new))
return false;
if (known_lt (rel_new, rel_old))
return true;
/* If there's nothing to choose between the loop bodies, see whether
there's a difference in the prologue and epilogue costs. */
if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
return false;
}
/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
true if we should. */
static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
loop_vec_info old_loop_vinfo)
{
if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
return false;
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"***** Preferring vector mode %s to vector mode %s\n",
GET_MODE_NAME (new_loop_vinfo->vector_mode),
GET_MODE_NAME (old_loop_vinfo->vector_mode));
return true;
}
/* Function vect_analyze_loop.
Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -2385,8 +2461,9 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
auto_vector_modes vector_modes;
/* Autodetect first vector size we try. */
targetm.vectorize.autovectorize_vector_modes (&vector_modes,
loop->simdlen != 0);
unsigned int autovec_flags
= targetm.vectorize.autovectorize_vector_modes (&vector_modes,
loop->simdlen != 0);
unsigned int mode_i = 0;
DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2409,6 +2486,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
machine_mode next_vector_mode = VOIDmode;
poly_uint64 lowest_th = 0;
unsigned vectorized_loops = 0;
bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
&& !unlimited_cost_model (loop));
bool vect_epilogues = false;
opt_result res = opt_result::success ();
@@ -2429,6 +2508,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
bool fatal = false;
/* When pick_lowest_cost_p is true, we should in principle iterate
over all the loop_vec_infos that LOOP_VINFO could replace and
try to vectorize LOOP_VINFO under the same conditions.
E.g. when trying to replace an epilogue loop, we should vectorize
LOOP_VINFO as an epilogue loop with the same VF limit. When trying
to replace the main loop, we should vectorize LOOP_VINFO as a main
loop too.
However, autovectorize_vector_modes is usually sorted as follows:
- Modes that naturally produce lower VFs usually follow modes that
naturally produce higher VFs.
- When modes naturally produce the same VF, maskable modes
usually follow unmaskable ones, so that the maskable mode
can be used to vectorize the epilogue of the unmaskable mode.
This order is preferred because it leads to the maximum
epilogue vectorization opportunities. Targets should only use
a different order if they want to make wide modes available while
disparaging them relative to earlier, smaller modes. The assumption
in that case is that the wider modes are more expensive in some
way that isn't reflected directly in the costs.
There should therefore be few interesting cases in which
LOOP_VINFO fails when treated as an epilogue loop, succeeds when
treated as a standalone loop, and ends up being genuinely cheaper
than FIRST_LOOP_VINFO. */
if (vect_epilogues)
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
@@ -2476,13 +2583,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
simdlen = 0;
}
else if (pick_lowest_cost_p && first_loop_vinfo)
{
/* Keep trying to roll back vectorization attempts while the
loop_vec_infos they produced were worse than this one. */
vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
while (!vinfos.is_empty ()
&& vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
{
gcc_assert (vect_epilogues);
delete vinfos.pop ();
}
if (vinfos.is_empty ()
&& vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
{
delete first_loop_vinfo;
first_loop_vinfo = opt_loop_vec_info::success (NULL);
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
}
}
if (first_loop_vinfo == NULL)
{
first_loop_vinfo = loop_vinfo;
lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
}
else if (vect_epilogues)
else if (vect_epilogues
/* For now only allow one epilogue loop. */
&& first_loop_vinfo->epilogue_vinfos.is_empty ())
{
first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
@@ -2506,12 +2634,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
&& param_vect_epilogues_nomask
&& LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
&& !loop->simduid
/* For now only allow one epilogue loop. */
&& first_loop_vinfo->epilogue_vinfos.is_empty ());
/* For now only allow one epilogue loop, but allow
pick_lowest_cost_p to replace it. */
&& (first_loop_vinfo->epilogue_vinfos.is_empty ()
|| pick_lowest_cost_p));
/* Commit to first_loop_vinfo if we have no reason to try
alternatives. */
if (!simdlen && !vect_epilogues)
if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
break;
}
else
@@ -3509,7 +3639,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
&vec_inside_cost, &vec_epilogue_cost);
vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
/* Stash the costs so that we can compare two loop_vec_infos. */
loop_vinfo->vec_inside_cost = vec_inside_cost;
loop_vinfo->vec_outside_cost = vec_outside_cost;
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

View File

@@ -601,6 +601,13 @@ public:
/* Cost of a single scalar iteration. */
int single_scalar_iteration_cost;
/* The cost of the vector prologue and epilogue, including peeled
iterations and set-up code. */
int vec_outside_cost;
/* The cost of the vector loop body. */
int vec_inside_cost;
/* Is the loop vectorizable? */
bool vectorizable;