Optionally pick the cheapest loop_vec_info
This patch adds a mode in which the vectoriser tries each available base vector mode and picks the one with the lowest cost. The new behaviour is selected by autovectorize_vector_modes. The patch keeps the current behaviour of preferring a VF of loop->simdlen over any larger or smaller VF, regardless of costs or target preferences. 2019-11-16 Richard Sandiford <richard.sandiford@arm.com> gcc/ * target.h (VECT_COMPARE_COSTS): New constant. * target.def (autovectorize_vector_modes): Return a bitmask of flags. * doc/tm.texi: Regenerate. * targhooks.h (default_autovectorize_vector_modes): Update accordingly. * targhooks.c (default_autovectorize_vector_modes): Likewise. * config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes): Likewise. * config/arc/arc.c (arc_autovectorize_vector_modes): Likewise. * config/arm/arm.c (arm_autovectorize_vector_modes): Likewise. * config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise. * config/mips/mips.c (mips_autovectorize_vector_modes): Likewise. * tree-vectorizer.h (_loop_vec_info::vec_outside_cost) (_loop_vec_info::vec_inside_cost): New member variables. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them. (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions. (vect_analyze_loop): When autovectorize_vector_modes returns VECT_COMPARE_COSTS, try vectorizing the loop with each available vector mode and picking the one with the lowest cost. (vect_estimate_min_profitable_iters): Record the computed costs in the loop_vec_info. From-SVN: r278336
This commit is contained in:
parent
f884cd2fea
commit
bcc7e346bf
@ -1,3 +1,26 @@
|
|||||||
|
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
|
||||||
|
|
||||||
|
* target.h (VECT_COMPARE_COSTS): New constant.
|
||||||
|
* target.def (autovectorize_vector_modes): Return a bitmask of flags.
|
||||||
|
* doc/tm.texi: Regenerate.
|
||||||
|
* targhooks.h (default_autovectorize_vector_modes): Update accordingly.
|
||||||
|
* targhooks.c (default_autovectorize_vector_modes): Likewise.
|
||||||
|
* config/aarch64/aarch64.c (aarch64_autovectorize_vector_modes):
|
||||||
|
Likewise.
|
||||||
|
* config/arc/arc.c (arc_autovectorize_vector_modes): Likewise.
|
||||||
|
* config/arm/arm.c (arm_autovectorize_vector_modes): Likewise.
|
||||||
|
* config/i386/i386.c (ix86_autovectorize_vector_modes): Likewise.
|
||||||
|
* config/mips/mips.c (mips_autovectorize_vector_modes): Likewise.
|
||||||
|
* tree-vectorizer.h (_loop_vec_info::vec_outside_cost)
|
||||||
|
(_loop_vec_info::vec_inside_cost): New member variables.
|
||||||
|
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them.
|
||||||
|
(vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions.
|
||||||
|
(vect_analyze_loop): When autovectorize_vector_modes returns
|
||||||
|
VECT_COMPARE_COSTS, try vectorizing the loop with each available
|
||||||
|
vector mode and picking the one with the lowest cost.
|
||||||
|
(vect_estimate_min_profitable_iters): Record the computed costs
|
||||||
|
in the loop_vec_info.
|
||||||
|
|
||||||
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
|
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
|
||||||
|
|
||||||
* tree-vectorizer.h (can_duplicate_and_interleave_p): Take an
|
* tree-vectorizer.h (can_duplicate_and_interleave_p): Take an
|
||||||
|
@ -15935,7 +15935,7 @@ aarch64_preferred_simd_mode (scalar_mode mode)
|
|||||||
|
|
||||||
/* Return a list of possible vector sizes for the vectorizer
|
/* Return a list of possible vector sizes for the vectorizer
|
||||||
to iterate over. */
|
to iterate over. */
|
||||||
static void
|
static unsigned int
|
||||||
aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
|
aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
|
||||||
{
|
{
|
||||||
if (TARGET_SVE)
|
if (TARGET_SVE)
|
||||||
@ -15961,6 +15961,8 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
|
|||||||
TODO: We could similarly support limited forms of V2QImode and V2HImode
|
TODO: We could similarly support limited forms of V2QImode and V2HImode
|
||||||
for this case. */
|
for this case. */
|
||||||
modes->safe_push (V2SImode);
|
modes->safe_push (V2SImode);
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Implement TARGET_MANGLE_TYPE. */
|
/* Implement TARGET_MANGLE_TYPE. */
|
||||||
|
@ -609,7 +609,7 @@ arc_preferred_simd_mode (scalar_mode mode)
|
|||||||
/* Implements target hook
|
/* Implements target hook
|
||||||
TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */
|
TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */
|
||||||
|
|
||||||
static void
|
static unsigned int
|
||||||
arc_autovectorize_vector_modes (vector_modes *modes, bool)
|
arc_autovectorize_vector_modes (vector_modes *modes, bool)
|
||||||
{
|
{
|
||||||
if (TARGET_PLUS_QMACW)
|
if (TARGET_PLUS_QMACW)
|
||||||
@ -617,6 +617,7 @@ arc_autovectorize_vector_modes (vector_modes *modes, bool)
|
|||||||
modes->quick_push (V4HImode);
|
modes->quick_push (V4HImode);
|
||||||
modes->quick_push (V2HImode);
|
modes->quick_push (V2HImode);
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_misalignment (machine_mode mode,
|
|||||||
static void arm_conditional_register_usage (void);
|
static void arm_conditional_register_usage (void);
|
||||||
static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
|
static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
|
||||||
static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
|
static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
|
||||||
static void arm_autovectorize_vector_modes (vector_modes *, bool);
|
static unsigned int arm_autovectorize_vector_modes (vector_modes *, bool);
|
||||||
static int arm_default_branch_cost (bool, bool);
|
static int arm_default_branch_cost (bool, bool);
|
||||||
static int arm_cortex_a5_branch_cost (bool, bool);
|
static int arm_cortex_a5_branch_cost (bool, bool);
|
||||||
static int arm_cortex_m_branch_cost (bool, bool);
|
static int arm_cortex_m_branch_cost (bool, bool);
|
||||||
@ -29015,7 +29015,7 @@ arm_vector_alignment (const_tree type)
|
|||||||
return align;
|
return align;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static unsigned int
|
||||||
arm_autovectorize_vector_modes (vector_modes *modes, bool)
|
arm_autovectorize_vector_modes (vector_modes *modes, bool)
|
||||||
{
|
{
|
||||||
if (!TARGET_NEON_VECTORIZE_DOUBLE)
|
if (!TARGET_NEON_VECTORIZE_DOUBLE)
|
||||||
@ -29023,6 +29023,7 @@ arm_autovectorize_vector_modes (vector_modes *modes, bool)
|
|||||||
modes->safe_push (V16QImode);
|
modes->safe_push (V16QImode);
|
||||||
modes->safe_push (V8QImode);
|
modes->safe_push (V8QImode);
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
|
@ -21384,7 +21384,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
|
|||||||
vectors. If AVX512F is enabled then try vectorizing with 512bit,
|
vectors. If AVX512F is enabled then try vectorizing with 512bit,
|
||||||
256bit and 128bit vectors. */
|
256bit and 128bit vectors. */
|
||||||
|
|
||||||
static void
|
static unsigned int
|
||||||
ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
|
ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
|
||||||
{
|
{
|
||||||
if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
|
if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
|
||||||
@ -21414,6 +21414,8 @@ ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
|
|||||||
|
|
||||||
if (TARGET_MMX_WITH_SSE)
|
if (TARGET_MMX_WITH_SSE)
|
||||||
modes->safe_push (V8QImode);
|
modes->safe_push (V8QImode);
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Implemenation of targetm.vectorize.get_mask_mode. */
|
/* Implemenation of targetm.vectorize.get_mask_mode. */
|
||||||
|
@ -13455,11 +13455,12 @@ mips_preferred_simd_mode (scalar_mode mode)
|
|||||||
|
|
||||||
/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */
|
/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */
|
||||||
|
|
||||||
static void
|
static unsigned int
|
||||||
mips_autovectorize_vector_modes (vector_modes *modes, bool)
|
mips_autovectorize_vector_modes (vector_modes *modes, bool)
|
||||||
{
|
{
|
||||||
if (ISA_HAS_MSA)
|
if (ISA_HAS_MSA)
|
||||||
modes->safe_push (V16QImode);
|
modes->safe_push (V16QImode);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Implement TARGET_INIT_LIBFUNCS. */
|
/* Implement TARGET_INIT_LIBFUNCS. */
|
||||||
|
@ -6008,7 +6008,7 @@ against lower halves of vectors recursively until the specified mode is
|
|||||||
reached. The default is @var{mode} which means no splitting.
|
reached. The default is @var{mode} which means no splitting.
|
||||||
@end deftypefn
|
@end deftypefn
|
||||||
|
|
||||||
@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all})
|
@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all})
|
||||||
If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}
|
If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}
|
||||||
is not the only approach worth considering, this hook should add one mode to
|
is not the only approach worth considering, this hook should add one mode to
|
||||||
@var{modes} for each useful alternative approach. These modes are then
|
@var{modes} for each useful alternative approach. These modes are then
|
||||||
@ -6024,9 +6024,19 @@ element mode.
|
|||||||
If @var{all} is true, add suitable vector modes even when they are generally
|
If @var{all} is true, add suitable vector modes even when they are generally
|
||||||
not expected to be worthwhile.
|
not expected to be worthwhile.
|
||||||
|
|
||||||
|
The hook returns a bitmask of flags that control how the modes in
|
||||||
|
@var{modes} are used. The flags are:
|
||||||
|
@table @code
|
||||||
|
@item VECT_COMPARE_COSTS
|
||||||
|
Tells the loop vectorizer to try all the provided modes and pick the one
|
||||||
|
with the lowest cost. By default the vectorizer will choose the first
|
||||||
|
mode that works.
|
||||||
|
@end table
|
||||||
|
|
||||||
The hook does not need to do anything if the vector returned by
|
The hook does not need to do anything if the vector returned by
|
||||||
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant
|
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant
|
||||||
for autovectorization. The default implementation does nothing.
|
for autovectorization. The default implementation adds no modes and
|
||||||
|
returns 0.
|
||||||
@end deftypefn
|
@end deftypefn
|
||||||
|
|
||||||
@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits})
|
@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits})
|
||||||
|
@ -1925,10 +1925,20 @@ element mode.\n\
|
|||||||
If @var{all} is true, add suitable vector modes even when they are generally\n\
|
If @var{all} is true, add suitable vector modes even when they are generally\n\
|
||||||
not expected to be worthwhile.\n\
|
not expected to be worthwhile.\n\
|
||||||
\n\
|
\n\
|
||||||
|
The hook returns a bitmask of flags that control how the modes in\n\
|
||||||
|
@var{modes} are used. The flags are:\n\
|
||||||
|
@table @code\n\
|
||||||
|
@item VECT_COMPARE_COSTS\n\
|
||||||
|
Tells the loop vectorizer to try all the provided modes and pick the one\n\
|
||||||
|
with the lowest cost. By default the vectorizer will choose the first\n\
|
||||||
|
mode that works.\n\
|
||||||
|
@end table\n\
|
||||||
|
\n\
|
||||||
The hook does not need to do anything if the vector returned by\n\
|
The hook does not need to do anything if the vector returned by\n\
|
||||||
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\
|
@code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\
|
||||||
for autovectorization. The default implementation does nothing.",
|
for autovectorization. The default implementation adds no modes and\n\
|
||||||
void,
|
returns 0.",
|
||||||
|
unsigned int,
|
||||||
(vector_modes *modes, bool all),
|
(vector_modes *modes, bool all),
|
||||||
default_autovectorize_vector_modes)
|
default_autovectorize_vector_modes)
|
||||||
|
|
||||||
|
@ -218,6 +218,14 @@ enum omp_device_kind_arch_isa {
|
|||||||
omp_device_isa
|
omp_device_isa
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Flags returned by TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES:
|
||||||
|
|
||||||
|
VECT_COMPARE_COSTS
|
||||||
|
Tells the loop vectorizer to try all the provided modes and
|
||||||
|
pick the one with the lowest cost. By default the vectorizer
|
||||||
|
will choose the first mode that works. */
|
||||||
|
const unsigned int VECT_COMPARE_COSTS = 1U << 0;
|
||||||
|
|
||||||
/* The target structure. This holds all the backend hooks. */
|
/* The target structure. This holds all the backend hooks. */
|
||||||
#define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
|
#define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
|
||||||
#define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
|
#define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
|
||||||
|
@ -1300,9 +1300,10 @@ default_split_reduction (machine_mode mode)
|
|||||||
|
|
||||||
/* By default only the preferred vector mode is tried. */
|
/* By default only the preferred vector mode is tried. */
|
||||||
|
|
||||||
void
|
unsigned int
|
||||||
default_autovectorize_vector_modes (vector_modes *, bool)
|
default_autovectorize_vector_modes (vector_modes *, bool)
|
||||||
{
|
{
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The default implementation of TARGET_VECTORIZE_RELATED_MODE. */
|
/* The default implementation of TARGET_VECTORIZE_RELATED_MODE. */
|
||||||
|
@ -113,7 +113,7 @@ default_builtin_support_vector_misalignment (machine_mode mode,
|
|||||||
int, bool);
|
int, bool);
|
||||||
extern machine_mode default_preferred_simd_mode (scalar_mode mode);
|
extern machine_mode default_preferred_simd_mode (scalar_mode mode);
|
||||||
extern machine_mode default_split_reduction (machine_mode);
|
extern machine_mode default_split_reduction (machine_mode);
|
||||||
extern void default_autovectorize_vector_modes (vector_modes *, bool);
|
extern unsigned int default_autovectorize_vector_modes (vector_modes *, bool);
|
||||||
extern opt_machine_mode default_vectorize_related_mode (machine_mode,
|
extern opt_machine_mode default_vectorize_related_mode (machine_mode,
|
||||||
scalar_mode,
|
scalar_mode,
|
||||||
poly_uint64);
|
poly_uint64);
|
||||||
|
@ -829,6 +829,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
|
|||||||
scan_map (NULL),
|
scan_map (NULL),
|
||||||
slp_unrolling_factor (1),
|
slp_unrolling_factor (1),
|
||||||
single_scalar_iteration_cost (0),
|
single_scalar_iteration_cost (0),
|
||||||
|
vec_outside_cost (0),
|
||||||
|
vec_inside_cost (0),
|
||||||
vectorizable (false),
|
vectorizable (false),
|
||||||
can_fully_mask_p (true),
|
can_fully_mask_p (true),
|
||||||
fully_masked_p (false),
|
fully_masked_p (false),
|
||||||
@ -2374,6 +2376,80 @@ again:
|
|||||||
goto start_over;
|
goto start_over;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
|
||||||
|
to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
|
||||||
|
OLD_LOOP_VINFO is better unless something specifically indicates
|
||||||
|
otherwise.
|
||||||
|
|
||||||
|
Note that this deliberately isn't a partial order. */
|
||||||
|
|
||||||
|
static bool
|
||||||
|
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
|
||||||
|
loop_vec_info old_loop_vinfo)
|
||||||
|
{
|
||||||
|
struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
|
||||||
|
gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
|
||||||
|
|
||||||
|
poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
|
||||||
|
poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
|
||||||
|
|
||||||
|
/* Always prefer a VF of loop->simdlen over any other VF. */
|
||||||
|
if (loop->simdlen)
|
||||||
|
{
|
||||||
|
bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
|
||||||
|
bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
|
||||||
|
if (new_simdlen_p != old_simdlen_p)
|
||||||
|
return new_simdlen_p;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Limit the VFs to what is likely to be the maximum number of iterations,
|
||||||
|
to handle cases in which at least one loop_vinfo is fully-masked. */
|
||||||
|
HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
|
||||||
|
if (estimated_max_niter != -1)
|
||||||
|
{
|
||||||
|
if (known_le (estimated_max_niter, new_vf))
|
||||||
|
new_vf = estimated_max_niter;
|
||||||
|
if (known_le (estimated_max_niter, old_vf))
|
||||||
|
old_vf = estimated_max_niter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check whether the (fractional) cost per scalar iteration is lower
|
||||||
|
or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
|
||||||
|
poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
|
||||||
|
* poly_widest_int (old_vf));
|
||||||
|
poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
|
||||||
|
* poly_widest_int (new_vf));
|
||||||
|
if (maybe_lt (rel_old, rel_new))
|
||||||
|
return false;
|
||||||
|
if (known_lt (rel_new, rel_old))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
/* If there's nothing to choose between the loop bodies, see whether
|
||||||
|
there's a difference in the prologue and epilogue costs. */
|
||||||
|
if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
|
||||||
|
return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
|
||||||
|
true if we should. */
|
||||||
|
|
||||||
|
static bool
|
||||||
|
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
|
||||||
|
loop_vec_info old_loop_vinfo)
|
||||||
|
{
|
||||||
|
if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (dump_enabled_p ())
|
||||||
|
dump_printf_loc (MSG_NOTE, vect_location,
|
||||||
|
"***** Preferring vector mode %s to vector mode %s\n",
|
||||||
|
GET_MODE_NAME (new_loop_vinfo->vector_mode),
|
||||||
|
GET_MODE_NAME (old_loop_vinfo->vector_mode));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Function vect_analyze_loop.
|
/* Function vect_analyze_loop.
|
||||||
|
|
||||||
Apply a set of analyses on LOOP, and create a loop_vec_info struct
|
Apply a set of analyses on LOOP, and create a loop_vec_info struct
|
||||||
@ -2385,8 +2461,9 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|||||||
auto_vector_modes vector_modes;
|
auto_vector_modes vector_modes;
|
||||||
|
|
||||||
/* Autodetect first vector size we try. */
|
/* Autodetect first vector size we try. */
|
||||||
targetm.vectorize.autovectorize_vector_modes (&vector_modes,
|
unsigned int autovec_flags
|
||||||
loop->simdlen != 0);
|
= targetm.vectorize.autovectorize_vector_modes (&vector_modes,
|
||||||
|
loop->simdlen != 0);
|
||||||
unsigned int mode_i = 0;
|
unsigned int mode_i = 0;
|
||||||
|
|
||||||
DUMP_VECT_SCOPE ("analyze_loop_nest");
|
DUMP_VECT_SCOPE ("analyze_loop_nest");
|
||||||
@ -2409,6 +2486,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|||||||
machine_mode next_vector_mode = VOIDmode;
|
machine_mode next_vector_mode = VOIDmode;
|
||||||
poly_uint64 lowest_th = 0;
|
poly_uint64 lowest_th = 0;
|
||||||
unsigned vectorized_loops = 0;
|
unsigned vectorized_loops = 0;
|
||||||
|
bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
|
||||||
|
&& !unlimited_cost_model (loop));
|
||||||
|
|
||||||
bool vect_epilogues = false;
|
bool vect_epilogues = false;
|
||||||
opt_result res = opt_result::success ();
|
opt_result res = opt_result::success ();
|
||||||
@ -2429,6 +2508,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|||||||
|
|
||||||
bool fatal = false;
|
bool fatal = false;
|
||||||
|
|
||||||
|
/* When pick_lowest_cost_p is true, we should in principle iterate
|
||||||
|
over all the loop_vec_infos that LOOP_VINFO could replace and
|
||||||
|
try to vectorize LOOP_VINFO under the same conditions.
|
||||||
|
E.g. when trying to replace an epilogue loop, we should vectorize
|
||||||
|
LOOP_VINFO as an epilogue loop with the same VF limit. When trying
|
||||||
|
to replace the main loop, we should vectorize LOOP_VINFO as a main
|
||||||
|
loop too.
|
||||||
|
|
||||||
|
However, autovectorize_vector_modes is usually sorted as follows:
|
||||||
|
|
||||||
|
- Modes that naturally produce lower VFs usually follow modes that
|
||||||
|
naturally produce higher VFs.
|
||||||
|
|
||||||
|
- When modes naturally produce the same VF, maskable modes
|
||||||
|
usually follow unmaskable ones, so that the maskable mode
|
||||||
|
can be used to vectorize the epilogue of the unmaskable mode.
|
||||||
|
|
||||||
|
This order is preferred because it leads to the maximum
|
||||||
|
epilogue vectorization opportunities. Targets should only use
|
||||||
|
a different order if they want to make wide modes available while
|
||||||
|
disparaging them relative to earlier, smaller modes. The assumption
|
||||||
|
in that case is that the wider modes are more expensive in some
|
||||||
|
way that isn't reflected directly in the costs.
|
||||||
|
|
||||||
|
There should therefore be few interesting cases in which
|
||||||
|
LOOP_VINFO fails when treated as an epilogue loop, succeeds when
|
||||||
|
treated as a standalone loop, and ends up being genuinely cheaper
|
||||||
|
than FIRST_LOOP_VINFO. */
|
||||||
if (vect_epilogues)
|
if (vect_epilogues)
|
||||||
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
|
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
|
||||||
|
|
||||||
@ -2476,13 +2583,34 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|||||||
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
|
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
|
||||||
simdlen = 0;
|
simdlen = 0;
|
||||||
}
|
}
|
||||||
|
else if (pick_lowest_cost_p && first_loop_vinfo)
|
||||||
|
{
|
||||||
|
/* Keep trying to roll back vectorization attempts while the
|
||||||
|
loop_vec_infos they produced were worse than this one. */
|
||||||
|
vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
|
||||||
|
while (!vinfos.is_empty ()
|
||||||
|
&& vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
|
||||||
|
{
|
||||||
|
gcc_assert (vect_epilogues);
|
||||||
|
delete vinfos.pop ();
|
||||||
|
}
|
||||||
|
if (vinfos.is_empty ()
|
||||||
|
&& vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
|
||||||
|
{
|
||||||
|
delete first_loop_vinfo;
|
||||||
|
first_loop_vinfo = opt_loop_vec_info::success (NULL);
|
||||||
|
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (first_loop_vinfo == NULL)
|
if (first_loop_vinfo == NULL)
|
||||||
{
|
{
|
||||||
first_loop_vinfo = loop_vinfo;
|
first_loop_vinfo = loop_vinfo;
|
||||||
lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
|
lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
|
||||||
}
|
}
|
||||||
else if (vect_epilogues)
|
else if (vect_epilogues
|
||||||
|
/* For now only allow one epilogue loop. */
|
||||||
|
&& first_loop_vinfo->epilogue_vinfos.is_empty ())
|
||||||
{
|
{
|
||||||
first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
|
first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
|
||||||
poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
|
poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
|
||||||
@ -2506,12 +2634,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
|
|||||||
&& param_vect_epilogues_nomask
|
&& param_vect_epilogues_nomask
|
||||||
&& LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
|
&& LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
|
||||||
&& !loop->simduid
|
&& !loop->simduid
|
||||||
/* For now only allow one epilogue loop. */
|
/* For now only allow one epilogue loop, but allow
|
||||||
&& first_loop_vinfo->epilogue_vinfos.is_empty ());
|
pick_lowest_cost_p to replace it. */
|
||||||
|
&& (first_loop_vinfo->epilogue_vinfos.is_empty ()
|
||||||
|
|| pick_lowest_cost_p));
|
||||||
|
|
||||||
/* Commit to first_loop_vinfo if we have no reason to try
|
/* Commit to first_loop_vinfo if we have no reason to try
|
||||||
alternatives. */
|
alternatives. */
|
||||||
if (!simdlen && !vect_epilogues)
|
if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -3509,7 +3639,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
|
|||||||
&vec_inside_cost, &vec_epilogue_cost);
|
&vec_inside_cost, &vec_epilogue_cost);
|
||||||
|
|
||||||
vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
|
vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
|
||||||
|
|
||||||
|
/* Stash the costs so that we can compare two loop_vec_infos. */
|
||||||
|
loop_vinfo->vec_inside_cost = vec_inside_cost;
|
||||||
|
loop_vinfo->vec_outside_cost = vec_outside_cost;
|
||||||
|
|
||||||
if (dump_enabled_p ())
|
if (dump_enabled_p ())
|
||||||
{
|
{
|
||||||
dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
|
dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
|
||||||
|
@ -601,6 +601,13 @@ public:
|
|||||||
/* Cost of a single scalar iteration. */
|
/* Cost of a single scalar iteration. */
|
||||||
int single_scalar_iteration_cost;
|
int single_scalar_iteration_cost;
|
||||||
|
|
||||||
|
/* The cost of the vector prologue and epilogue, including peeled
|
||||||
|
iterations and set-up code. */
|
||||||
|
int vec_outside_cost;
|
||||||
|
|
||||||
|
/* The cost of the vector loop body. */
|
||||||
|
int vec_inside_cost;
|
||||||
|
|
||||||
/* Is the loop vectorizable? */
|
/* Is the loop vectorizable? */
|
||||||
bool vectorizable;
|
bool vectorizable;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user