vect: Hookize better_loop_vinfo_p
One of the things we want to do on AArch64 is compare vector loops side-by-side and pick the best one. For some targets, we want this to be based on issue rates as well as the usual latency-based costs (at least for loops with relatively high iteration counts). The current approach to doing this is: when costing vectorisation candidate A, try to guess what the other main candidate B will look like and adjust A's latency-based cost up or down based on the likely difference between A and B's issue rates. This effectively means that we try to cost parts of B at the same time as A, without actually being able to see B. This is needlessly indirect and complex. It was a compromise due to the code being added (too) late in the GCC 11 cycle, so that target-independent changes weren't possible. The target-independent code already compares two candidate loop_vec_infos side-by-side, so that information about A and B above are available directly. This patch creates a way for targets to hook into this comparison. The AArch64 code can therefore hook into better_main_loop_than_p to compare issue rates. If the issue rate comparison isn't decisive, the code can fall back to the normal latency-based comparison instead. gcc/ * tree-vectorizer.h (vector_costs::better_main_loop_than_p) (vector_costs::better_epilogue_loop_than_p) (vector_costs::compare_inside_loop_cost) (vector_costs::compare_outside_loop_cost): Likewise. * tree-vectorizer.c (vector_costs::better_main_loop_than_p) (vector_costs::better_epilogue_loop_than_p) (vector_costs::compare_inside_loop_cost) (vector_costs::compare_outside_loop_cost): New functions, containing code moved from... * tree-vect-loop.c (vect_better_loop_vinfo_p): ...here.
This commit is contained in:
parent
772d76acb5
commit
5720a9d5be
|
@ -2784,144 +2784,12 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
|
|||
return new_simdlen_p;
|
||||
}
|
||||
|
||||
loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
|
||||
if (main_loop)
|
||||
{
|
||||
poly_uint64 main_poly_vf = LOOP_VINFO_VECT_FACTOR (main_loop);
|
||||
unsigned HOST_WIDE_INT main_vf;
|
||||
unsigned HOST_WIDE_INT old_factor, new_factor, old_cost, new_cost;
|
||||
/* If we can determine how many iterations are left for the epilogue
|
||||
loop, that is if both the main loop's vectorization factor and number
|
||||
of iterations are constant, then we use them to calculate the cost of
|
||||
the epilogue loop together with a 'likely value' for the epilogues
|
||||
vectorization factor. Otherwise we use the main loop's vectorization
|
||||
factor and the maximum poly value for the epilogue's. If the target
|
||||
has not provided with a sensible upper bound poly vectorization
|
||||
factors are likely to be favored over constant ones. */
|
||||
if (main_poly_vf.is_constant (&main_vf)
|
||||
&& LOOP_VINFO_NITERS_KNOWN_P (main_loop))
|
||||
{
|
||||
unsigned HOST_WIDE_INT niters
|
||||
= LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
|
||||
HOST_WIDE_INT old_likely_vf
|
||||
= estimated_poly_value (old_vf, POLY_VALUE_LIKELY);
|
||||
HOST_WIDE_INT new_likely_vf
|
||||
= estimated_poly_value (new_vf, POLY_VALUE_LIKELY);
|
||||
const auto *old_costs = old_loop_vinfo->vector_costs;
|
||||
const auto *new_costs = new_loop_vinfo->vector_costs;
|
||||
if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
|
||||
return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
|
||||
|
||||
/* If the epilogue is using partial vectors we account for the
|
||||
partial iteration here too. */
|
||||
old_factor = niters / old_likely_vf;
|
||||
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (old_loop_vinfo)
|
||||
&& niters % old_likely_vf != 0)
|
||||
old_factor++;
|
||||
|
||||
new_factor = niters / new_likely_vf;
|
||||
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (new_loop_vinfo)
|
||||
&& niters % new_likely_vf != 0)
|
||||
new_factor++;
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned HOST_WIDE_INT main_vf_max
|
||||
= estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
|
||||
|
||||
old_factor = main_vf_max / estimated_poly_value (old_vf,
|
||||
POLY_VALUE_MAX);
|
||||
new_factor = main_vf_max / estimated_poly_value (new_vf,
|
||||
POLY_VALUE_MAX);
|
||||
|
||||
/* If the loop is not using partial vectors then it will iterate one
|
||||
time less than one that does. It is safe to subtract one here,
|
||||
because the main loop's vf is always at least 2x bigger than that
|
||||
of an epilogue. */
|
||||
if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (old_loop_vinfo))
|
||||
old_factor -= 1;
|
||||
if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (new_loop_vinfo))
|
||||
new_factor -= 1;
|
||||
}
|
||||
|
||||
/* Compute the costs by multiplying the inside costs with the factor and
|
||||
add the outside costs for a more complete picture. The factor is the
|
||||
amount of times we are expecting to iterate this epilogue. */
|
||||
old_cost = old_loop_vinfo->vector_costs->body_cost () * old_factor;
|
||||
new_cost = new_loop_vinfo->vector_costs->body_cost () * new_factor;
|
||||
old_cost += old_loop_vinfo->vector_costs->outside_cost ();
|
||||
new_cost += new_loop_vinfo->vector_costs->outside_cost ();
|
||||
return new_cost < old_cost;
|
||||
}
|
||||
|
||||
/* Limit the VFs to what is likely to be the maximum number of iterations,
|
||||
to handle cases in which at least one loop_vinfo is fully-masked. */
|
||||
HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
|
||||
if (estimated_max_niter != -1)
|
||||
{
|
||||
if (known_le (estimated_max_niter, new_vf))
|
||||
new_vf = estimated_max_niter;
|
||||
if (known_le (estimated_max_niter, old_vf))
|
||||
old_vf = estimated_max_niter;
|
||||
}
|
||||
|
||||
/* Check whether the (fractional) cost per scalar iteration is lower
|
||||
or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
|
||||
poly_int64 rel_new = new_loop_vinfo->vector_costs->body_cost () * old_vf;
|
||||
poly_int64 rel_old = old_loop_vinfo->vector_costs->body_cost () * new_vf;
|
||||
|
||||
HOST_WIDE_INT est_rel_new_min
|
||||
= estimated_poly_value (rel_new, POLY_VALUE_MIN);
|
||||
HOST_WIDE_INT est_rel_new_max
|
||||
= estimated_poly_value (rel_new, POLY_VALUE_MAX);
|
||||
|
||||
HOST_WIDE_INT est_rel_old_min
|
||||
= estimated_poly_value (rel_old, POLY_VALUE_MIN);
|
||||
HOST_WIDE_INT est_rel_old_max
|
||||
= estimated_poly_value (rel_old, POLY_VALUE_MAX);
|
||||
|
||||
/* Check first if we can make out an unambigous total order from the minimum
|
||||
and maximum estimates. */
|
||||
if (est_rel_new_min < est_rel_old_min
|
||||
&& est_rel_new_max < est_rel_old_max)
|
||||
return true;
|
||||
else if (est_rel_old_min < est_rel_new_min
|
||||
&& est_rel_old_max < est_rel_new_max)
|
||||
return false;
|
||||
/* When old_loop_vinfo uses a variable vectorization factor,
|
||||
we know that it has a lower cost for at least one runtime VF.
|
||||
However, we don't know how likely that VF is.
|
||||
|
||||
One option would be to compare the costs for the estimated VFs.
|
||||
The problem is that that can put too much pressure on the cost
|
||||
model. E.g. if the estimated VF is also the lowest possible VF,
|
||||
and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
|
||||
for the estimated VF, we'd then choose new_loop_vinfo even
|
||||
though (a) new_loop_vinfo might not actually be better than
|
||||
old_loop_vinfo for that VF and (b) it would be significantly
|
||||
worse at larger VFs.
|
||||
|
||||
Here we go for a hacky compromise: pick new_loop_vinfo if it is
|
||||
no more expensive than old_loop_vinfo even after doubling the
|
||||
estimated old_loop_vinfo VF. For all but trivial loops, this
|
||||
ensures that we only pick new_loop_vinfo if it is significantly
|
||||
better than old_loop_vinfo at the estimated VF. */
|
||||
|
||||
if (est_rel_old_min != est_rel_new_min
|
||||
|| est_rel_old_max != est_rel_new_max)
|
||||
{
|
||||
HOST_WIDE_INT est_rel_new_likely
|
||||
= estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
|
||||
HOST_WIDE_INT est_rel_old_likely
|
||||
= estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
|
||||
|
||||
return est_rel_new_likely * 2 <= est_rel_old_likely;
|
||||
}
|
||||
|
||||
/* If there's nothing to choose between the loop bodies, see whether
|
||||
there's a difference in the prologue and epilogue costs. */
|
||||
auto old_outside_cost = old_loop_vinfo->vector_costs->outside_cost ();
|
||||
auto new_outside_cost = new_loop_vinfo->vector_costs->outside_cost ();
|
||||
if (new_outside_cost != old_outside_cost)
|
||||
return new_outside_cost < old_outside_cost;
|
||||
|
||||
return false;
|
||||
return new_costs->better_main_loop_than_p (old_costs);
|
||||
}
|
||||
|
||||
/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
|
||||
|
|
|
@ -1744,3 +1744,207 @@ vector_costs::adjust_cost_for_freq (stmt_vec_info stmt_info,
|
|||
}
|
||||
return cost;
|
||||
}
|
||||
|
||||
/* See the comment above the declaration for details. */
|
||||
|
||||
bool
|
||||
vector_costs::better_main_loop_than_p (const vector_costs *other) const
|
||||
{
|
||||
int diff = compare_inside_loop_cost (other);
|
||||
if (diff != 0)
|
||||
return diff < 0;
|
||||
|
||||
/* If there's nothing to choose between the loop bodies, see whether
|
||||
there's a difference in the prologue and epilogue costs. */
|
||||
diff = compare_outside_loop_cost (other);
|
||||
if (diff != 0)
|
||||
return diff < 0;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* See the comment above the declaration for details. */
|
||||
|
||||
bool
|
||||
vector_costs::better_epilogue_loop_than_p (const vector_costs *other,
|
||||
loop_vec_info main_loop) const
|
||||
{
|
||||
loop_vec_info this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
|
||||
loop_vec_info other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
|
||||
|
||||
poly_int64 this_vf = LOOP_VINFO_VECT_FACTOR (this_loop_vinfo);
|
||||
poly_int64 other_vf = LOOP_VINFO_VECT_FACTOR (other_loop_vinfo);
|
||||
|
||||
poly_uint64 main_poly_vf = LOOP_VINFO_VECT_FACTOR (main_loop);
|
||||
unsigned HOST_WIDE_INT main_vf;
|
||||
unsigned HOST_WIDE_INT other_factor, this_factor, other_cost, this_cost;
|
||||
/* If we can determine how many iterations are left for the epilogue
|
||||
loop, that is if both the main loop's vectorization factor and number
|
||||
of iterations are constant, then we use them to calculate the cost of
|
||||
the epilogue loop together with a 'likely value' for the epilogues
|
||||
vectorization factor. Otherwise we use the main loop's vectorization
|
||||
factor and the maximum poly value for the epilogue's. If the target
|
||||
has not provided with a sensible upper bound poly vectorization
|
||||
factors are likely to be favored over constant ones. */
|
||||
if (main_poly_vf.is_constant (&main_vf)
|
||||
&& LOOP_VINFO_NITERS_KNOWN_P (main_loop))
|
||||
{
|
||||
unsigned HOST_WIDE_INT niters
|
||||
= LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
|
||||
HOST_WIDE_INT other_likely_vf
|
||||
= estimated_poly_value (other_vf, POLY_VALUE_LIKELY);
|
||||
HOST_WIDE_INT this_likely_vf
|
||||
= estimated_poly_value (this_vf, POLY_VALUE_LIKELY);
|
||||
|
||||
/* If the epilogue is using partial vectors we account for the
|
||||
partial iteration here too. */
|
||||
other_factor = niters / other_likely_vf;
|
||||
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (other_loop_vinfo)
|
||||
&& niters % other_likely_vf != 0)
|
||||
other_factor++;
|
||||
|
||||
this_factor = niters / this_likely_vf;
|
||||
if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (this_loop_vinfo)
|
||||
&& niters % this_likely_vf != 0)
|
||||
this_factor++;
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned HOST_WIDE_INT main_vf_max
|
||||
= estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
|
||||
|
||||
other_factor = main_vf_max / estimated_poly_value (other_vf,
|
||||
POLY_VALUE_MAX);
|
||||
this_factor = main_vf_max / estimated_poly_value (this_vf,
|
||||
POLY_VALUE_MAX);
|
||||
|
||||
/* If the loop is not using partial vectors then it will iterate one
|
||||
time less than one that does. It is safe to subtract one here,
|
||||
because the main loop's vf is always at least 2x bigger than that
|
||||
of an epilogue. */
|
||||
if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (other_loop_vinfo))
|
||||
other_factor -= 1;
|
||||
if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (this_loop_vinfo))
|
||||
this_factor -= 1;
|
||||
}
|
||||
|
||||
/* Compute the costs by multiplying the inside costs with the factor and
|
||||
add the outside costs for a more complete picture. The factor is the
|
||||
amount of times we are expecting to iterate this epilogue. */
|
||||
other_cost = other->body_cost () * other_factor;
|
||||
this_cost = this->body_cost () * this_factor;
|
||||
other_cost += other->outside_cost ();
|
||||
this_cost += this->outside_cost ();
|
||||
return this_cost < other_cost;
|
||||
}
|
||||
|
||||
/* A <=>-style subroutine of better_main_loop_than_p. Check whether we can
|
||||
determine the return value of better_main_loop_than_p by comparing the
|
||||
inside (loop body) costs of THIS and OTHER. Return:
|
||||
|
||||
* -1 if better_main_loop_than_p should return true.
|
||||
* 1 if better_main_loop_than_p should return false.
|
||||
* 0 if we can't decide. */
|
||||
|
||||
int
|
||||
vector_costs::compare_inside_loop_cost (const vector_costs *other) const
|
||||
{
|
||||
loop_vec_info this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
|
||||
loop_vec_info other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
|
||||
|
||||
struct loop *loop = LOOP_VINFO_LOOP (this_loop_vinfo);
|
||||
gcc_assert (LOOP_VINFO_LOOP (other_loop_vinfo) == loop);
|
||||
|
||||
poly_int64 this_vf = LOOP_VINFO_VECT_FACTOR (this_loop_vinfo);
|
||||
poly_int64 other_vf = LOOP_VINFO_VECT_FACTOR (other_loop_vinfo);
|
||||
|
||||
/* Limit the VFs to what is likely to be the maximum number of iterations,
|
||||
to handle cases in which at least one loop_vinfo is fully-masked. */
|
||||
HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
|
||||
if (estimated_max_niter != -1)
|
||||
{
|
||||
if (known_le (estimated_max_niter, this_vf))
|
||||
this_vf = estimated_max_niter;
|
||||
if (known_le (estimated_max_niter, other_vf))
|
||||
other_vf = estimated_max_niter;
|
||||
}
|
||||
|
||||
/* Check whether the (fractional) cost per scalar iteration is lower or
|
||||
higher: this_inside_cost / this_vf vs. other_inside_cost / other_vf. */
|
||||
poly_int64 rel_this = this_loop_vinfo->vector_costs->body_cost () * other_vf;
|
||||
poly_int64 rel_other
|
||||
= other_loop_vinfo->vector_costs->body_cost () * this_vf;
|
||||
|
||||
HOST_WIDE_INT est_rel_this_min
|
||||
= estimated_poly_value (rel_this, POLY_VALUE_MIN);
|
||||
HOST_WIDE_INT est_rel_this_max
|
||||
= estimated_poly_value (rel_this, POLY_VALUE_MAX);
|
||||
|
||||
HOST_WIDE_INT est_rel_other_min
|
||||
= estimated_poly_value (rel_other, POLY_VALUE_MIN);
|
||||
HOST_WIDE_INT est_rel_other_max
|
||||
= estimated_poly_value (rel_other, POLY_VALUE_MAX);
|
||||
|
||||
/* Check first if we can make out an unambigous total order from the minimum
|
||||
and maximum estimates. */
|
||||
if (est_rel_this_min < est_rel_other_min
|
||||
&& est_rel_this_max < est_rel_other_max)
|
||||
return -1;
|
||||
|
||||
if (est_rel_other_min < est_rel_this_min
|
||||
&& est_rel_other_max < est_rel_this_max)
|
||||
return 1;
|
||||
|
||||
/* When other_loop_vinfo uses a variable vectorization factor,
|
||||
we know that it has a lower cost for at least one runtime VF.
|
||||
However, we don't know how likely that VF is.
|
||||
|
||||
One option would be to compare the costs for the estimated VFs.
|
||||
The problem is that that can put too much pressure on the cost
|
||||
model. E.g. if the estimated VF is also the lowest possible VF,
|
||||
and if other_loop_vinfo is 1 unit worse than this_loop_vinfo
|
||||
for the estimated VF, we'd then choose this_loop_vinfo even
|
||||
though (a) this_loop_vinfo might not actually be better than
|
||||
other_loop_vinfo for that VF and (b) it would be significantly
|
||||
worse at larger VFs.
|
||||
|
||||
Here we go for a hacky compromise: pick this_loop_vinfo if it is
|
||||
no more expensive than other_loop_vinfo even after doubling the
|
||||
estimated other_loop_vinfo VF. For all but trivial loops, this
|
||||
ensures that we only pick this_loop_vinfo if it is significantly
|
||||
better than other_loop_vinfo at the estimated VF. */
|
||||
if (est_rel_other_min != est_rel_this_min
|
||||
|| est_rel_other_max != est_rel_this_max)
|
||||
{
|
||||
HOST_WIDE_INT est_rel_this_likely
|
||||
= estimated_poly_value (rel_this, POLY_VALUE_LIKELY);
|
||||
HOST_WIDE_INT est_rel_other_likely
|
||||
= estimated_poly_value (rel_other, POLY_VALUE_LIKELY);
|
||||
|
||||
return est_rel_this_likely * 2 <= est_rel_other_likely ? -1 : 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* A <=>-style subroutine of better_main_loop_than_p, used when there is
|
||||
nothing to choose between the inside (loop body) costs of THIS and OTHER.
|
||||
Check whether we can determine the return value of better_main_loop_than_p
|
||||
by comparing the outside (prologue and epilogue) costs of THIS and OTHER.
|
||||
Return:
|
||||
|
||||
* -1 if better_main_loop_than_p should return true.
|
||||
* 1 if better_main_loop_than_p should return false.
|
||||
* 0 if we can't decide. */
|
||||
|
||||
int
|
||||
vector_costs::compare_outside_loop_cost (const vector_costs *other) const
|
||||
{
|
||||
auto this_outside_cost = this->outside_cost ();
|
||||
auto other_outside_cost = other->outside_cost ();
|
||||
if (this_outside_cost != other_outside_cost)
|
||||
return this_outside_cost < other_outside_cost ? -1 : 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1419,6 +1419,21 @@ public:
|
|||
read back using the functions below. */
|
||||
virtual void finish_cost ();
|
||||
|
||||
/* The costs in THIS and OTHER both describe ways of vectorizing
|
||||
a main loop. Return true if the costs described by THIS are
|
||||
cheaper than the costs described by OTHER. Return false if any
|
||||
of the following are true:
|
||||
|
||||
- THIS and OTHER are of equal cost
|
||||
- OTHER is better than THIS
|
||||
- we can't be sure about the relative costs of THIS and OTHER. */
|
||||
virtual bool better_main_loop_than_p (const vector_costs *other) const;
|
||||
|
||||
/* Likewise, but the costs in THIS and OTHER both describe ways of
|
||||
vectorizing an epilogue loop of MAIN_LOOP. */
|
||||
virtual bool better_epilogue_loop_than_p (const vector_costs *other,
|
||||
loop_vec_info main_loop) const;
|
||||
|
||||
unsigned int prologue_cost () const;
|
||||
unsigned int body_cost () const;
|
||||
unsigned int epilogue_cost () const;
|
||||
|
@ -1429,6 +1444,8 @@ protected:
|
|||
unsigned int);
|
||||
unsigned int adjust_cost_for_freq (stmt_vec_info, vect_cost_model_location,
|
||||
unsigned int);
|
||||
int compare_inside_loop_cost (const vector_costs *) const;
|
||||
int compare_outside_loop_cost (const vector_costs *) const;
|
||||
|
||||
/* The region of code that we're considering vectorizing. */
|
||||
vec_info *m_vinfo;
|
||||
|
|
Loading…
Reference in New Issue