[vect] Add main vectorized loop unrolling

gcc/ChangeLog:

	* tree-vect-loop.cc (vect_estimate_min_profitable_iters): Pass new
	argument suggested_unroll_factor.
	(vect_analyze_loop_costing): Likewise.
	(_loop_vec_info::_loop_vec_info): Initialize new member
	suggested_unroll_factor.
	(vect_determine_partial_vectors_and_peeling): Make epilogue of unrolled
	main loop use partial vectors.
	(vect_analyze_loop_2): Pass and use new argument
	suggested_unroll_factor.
	(vect_analyze_loop_1): Change to initialize local
	suggested_unroll_factor and use it.
	(vectorizable_reduction): Don't use single_defuse_cycle when unrolling.
	* tree-vectorizer.h (_loop_vec_info::_loop_vec_info): Add new member
	suggested_unroll_factor.
	(vector_costs::vector_costs): Add new member m_suggested_unroll_factor.
	(vector_costs::suggested_unroll_factor): New getter function.
	(finish_cost): Set return argument suggested_unroll_factor.
This commit is contained in:
Andre Vieira 2022-01-18 15:57:39 +00:00
parent 254ada46ae
commit 7ca1582ca6
2 changed files with 94 additions and 12 deletions

View File

@ -154,7 +154,8 @@ along with GCC; see the file COPYING3. If not see
http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
bool *, bool *);
@ -831,6 +832,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
skip_main_loop_edge (nullptr),
skip_this_loop_edge (nullptr),
reusable_accumulators (),
suggested_unroll_factor (1),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
rgroup_compare_type (NULL_TREE),
@ -1834,7 +1836,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
definitely no, or -1 if it's worth retrying. */
static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo)
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
unsigned *suggested_unroll_factor)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@ -1868,7 +1871,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
int min_profitable_iters, min_profitable_estimate;
vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
&min_profitable_estimate);
&min_profitable_estimate,
suggested_unroll_factor);
if (min_profitable_iters < 0)
{
@ -2152,10 +2156,16 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
vectors to the epilogue, with the main loop continuing to operate
on full vectors.
If we are unrolling we also do not want to use partial vectors. This
is to avoid the overhead of generating multiple masks and also to
avoid having to execute entire iterations of FALSE masked instructions
when dealing with one or fewer full iterations.
??? We could then end up failing to use partial vectors if we
decide to peel iterations into a prologue, and if the main loop
then ends up processing fewer than VF iterations. */
if (param_vect_partial_vector_usage == 1
if ((param_vect_partial_vector_usage == 1
|| loop_vinfo->suggested_unroll_factor > 1)
&& !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& !vect_known_niters_smaller_than_vf (loop_vinfo))
LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@ -2222,7 +2232,8 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
for it. The different analyses will record information in the
loop_vec_info struct. */
static opt_result
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
unsigned *suggested_unroll_factor)
{
opt_result ok = opt_result::success ();
int res;
@ -2382,6 +2393,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
set of rgroups. */
gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
/* Apply the suggested unrolling factor; this was determined by the backend
during finish_cost the first time we ran the analysis for this
vector mode. */
if (loop_vinfo->suggested_unroll_factor > 1)
LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
/* This is the point where we can re-start analysis with SLP forced off. */
start_over:
@ -2573,7 +2590,7 @@ start_over:
return ok;
/* Check the costings of the loop make vectorizing worthwhile. */
res = vect_analyze_loop_costing (loop_vinfo);
res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
if (res < 0)
{
ok = opt_result::failure_at (vect_location,
@ -2851,15 +2868,38 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
machine_mode vector_mode = vector_modes[mode_i];
loop_vinfo->vector_mode = vector_mode;
unsigned int suggested_unroll_factor = 1;
/* Run the main analysis. */
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
&suggested_unroll_factor);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"***** Analysis %s with vector mode %s\n",
res ? "succeeded" : " failed",
GET_MODE_NAME (loop_vinfo->vector_mode));
if (!main_loop_vinfo && suggested_unroll_factor > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"***** Re-trying analysis for unrolling"
" with unroll factor %d.\n",
suggested_unroll_factor);
loop_vec_info unroll_vinfo
= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
unroll_vinfo->vector_mode = vector_mode;
unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
if (new_res)
{
delete loop_vinfo;
loop_vinfo = unroll_vinfo;
}
else
delete unroll_vinfo;
}
/* Remember the autodetected vector mode. */
if (vector_mode == VOIDmode)
autodetected_vector_mode = loop_vinfo->vector_mode;
@ -3860,7 +3900,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
int *ret_min_profitable_niters,
int *ret_min_profitable_estimate)
int *ret_min_profitable_estimate,
unsigned *suggested_unroll_factor)
{
int min_profitable_iters;
int min_profitable_estimate;
@ -4227,7 +4268,22 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
/* Complete the target-specific cost calculations. */
finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
&vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
&vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
suggested_unroll_factor);
if (suggested_unroll_factor && *suggested_unroll_factor > 1
&& LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
&& !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
*suggested_unroll_factor,
LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"can't unroll as unrolled vectorization factor larger"
" than maximum vectorization factor: %d\n",
LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
*suggested_unroll_factor = 1;
}
vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
@ -7194,10 +7250,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
This only works when we see both the reduction PHI and its only consumer
in vectorizable_reduction and there are no intermediate stmts
participating. */
participating. When unrolling we want each unrolled iteration to have its
own reduction accumulator since one of the main goals of unrolling a
reduction is to reduce the aggregate loop-carried latency. */
if (ncopies > 1
&& (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
&& reduc_chain_length == 1)
&& reduc_chain_length == 1
&& loop_vinfo->suggested_unroll_factor == 1)
single_defuse_cycle = true;
if (single_defuse_cycle || lane_reduc_code_p)

View File

@ -642,6 +642,13 @@ public:
about the reductions that generated them. */
hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
/* The number of times that the target suggested we unroll the vector loop
in order to promote more ILP. This value will be used to re-analyze the
loop for vectorization and if successful the value will be folded into
vectorization_factor (and therefore exactly divides
vectorization_factor). */
unsigned int suggested_unroll_factor;
/* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
if there is no particular limit. */
unsigned HOST_WIDE_INT max_vectorization_factor;
@ -1465,6 +1472,7 @@ public:
unsigned int epilogue_cost () const;
unsigned int outside_cost () const;
unsigned int total_cost () const;
unsigned int suggested_unroll_factor () const;
protected:
unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
@ -1484,6 +1492,9 @@ protected:
/* The costs of the three regions, indexed by vect_cost_model_location. */
unsigned int m_costs[3];
/* The suggested unrolling factor determined at finish_cost. */
unsigned int m_suggested_unroll_factor;
/* True if finish_cost has been called. */
bool m_finished;
};
@ -1496,6 +1507,7 @@ vector_costs::vector_costs (vec_info *vinfo, bool costing_for_scalar)
: m_vinfo (vinfo),
m_costing_for_scalar (costing_for_scalar),
m_costs (),
m_suggested_unroll_factor(1),
m_finished (false)
{
}
@ -1544,6 +1556,15 @@ vector_costs::total_cost () const
return body_cost () + outside_cost ();
}
/* Return the suggested unroll factor stored in m_suggested_unroll_factor,
   i.e. the value determined at finish_cost.  Asserts (via
   gcc_checking_assert) that m_finished is set, so the getter is only
   expected to be used once finish_cost has been called.  */
inline unsigned int
vector_costs::suggested_unroll_factor () const
{
gcc_checking_assert (m_finished);
return m_suggested_unroll_factor;
}
#define VECT_MAX_COST 1000
/* The maximum number of intermediate steps required in multi-step type
@ -1720,12 +1741,14 @@ add_stmt_cost (vector_costs *costs, stmt_info_for_cost *i)
static inline void
finish_cost (vector_costs *costs, const vector_costs *scalar_costs,
unsigned *prologue_cost, unsigned *body_cost,
unsigned *epilogue_cost)
unsigned *epilogue_cost, unsigned *suggested_unroll_factor = NULL)
{
/* Finish COSTS first; the per-region costs are read back from it only
   after this call.  */
costs->finish_cost (scalar_costs);
*prologue_cost = costs->prologue_cost ();
*body_cost = costs->body_cost ();
*epilogue_cost = costs->epilogue_cost ();
/* SUGGESTED_UNROLL_FACTOR is optional: it defaults to NULL, in which case
   nothing is written.  Otherwise it receives the factor recorded in
   COSTS.  */
if (suggested_unroll_factor)
*suggested_unroll_factor = costs->suggested_unroll_factor ();
}
inline void