aarch64: Move more code into aarch64_vector_costs

This patch moves more code into aarch64_vector_costs and reuses
some of the information that is now available in the base class.

I'm planning to significantly rework this code, with more hooks
into the vectoriser, but this seemed worth doing as a first step.
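
The general shape of the change is sketched below with made-up, minimal
types (costs_before/costs_after are illustrative only, not the real GCC
classes): a free helper that took an explicit costs pointer becomes a
member function, and the formerly public fields are renamed with an
"m_" prefix and made private.

    // Hypothetical sketch of the pattern used by the patch; the names are
    // invented and the real code in gcc/config/aarch64/aarch64.c is far larger.

    // Before: public state, updated via a free function that takes an
    // explicit pointer to the costs object.
    struct costs_before
    {
      unsigned int vec_flags = 0;
    };

    static void
    record_flags (costs_before *costs, unsigned int flags)
    {
      costs->vec_flags = flags;
    }

    // After: the state is private and carries an "m_" prefix, and the
    // helper becomes a member function, so the pointer argument disappears.
    class costs_after
    {
    public:
      void record_flags (unsigned int flags) { m_vec_flags = flags; }
      unsigned int flags () const { return m_vec_flags; }

    private:
      unsigned int m_vec_flags = 0;
    };

    int
    main ()
    {
      costs_before old_style;
      record_flags (&old_style, 2);

      costs_after new_style;
      new_style.record_flags (2);
      return old_style.vec_flags == new_style.flags () ? 0 : 1;
    }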

gcc/
	* config/aarch64/aarch64.c (aarch64_vector_costs): Make member
	variables private and add "m_" to their names.  Remove is_loop.
	(aarch64_record_potential_advsimd_unrolling): Replace with...
	(aarch64_vector_costs::record_potential_advsimd_unrolling): ...this.
	(aarch64_analyze_loop_vinfo): Replace with...
	(aarch64_vector_costs::analyze_loop_vinfo): ...this.
	Move initialization of (m_)vec_flags to add_stmt_cost.
	(aarch64_analyze_bb_vinfo): Delete.
	(aarch64_count_ops): Replace with...
	(aarch64_vector_costs::count_ops): ...this.
	(aarch64_vector_costs::add_stmt_cost): Set m_vec_flags,
	using m_costing_for_scalar to test whether we're costing
	scalar or vector code.
	(aarch64_adjust_body_cost_sve): Replace with...
	(aarch64_vector_costs::adjust_body_cost_sve): ...this.
	(aarch64_adjust_body_cost): Replace with...
	(aarch64_vector_costs::adjust_body_cost): ...this.
	(aarch64_vector_costs::finish_cost): Use m_vinfo instead of is_loop.
commit d43fc1df73
parent 6239dd0512
Author: Richard Sandiford
Date:   2021-11-04 12:31:17 +00:00

1 file changed, 154 insertions(+), 183 deletions(-)

--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14589,8 +14589,9 @@ struct aarch64_sve_op_count : aarch64_vec_op_count
 };
 
 /* Information about vector code that we're in the process of costing.  */
-struct aarch64_vector_costs : public vector_costs
+class aarch64_vector_costs : public vector_costs
 {
+public:
   using vector_costs::vector_costs;
 
   unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
@@ -14599,26 +14600,31 @@ struct aarch64_vector_costs : public vector_costs
 			      vect_cost_model_location where) override;
   void finish_cost () override;
 
-  /* True if we have performed one-time initialization based on the vec_info.
+private:
+  void record_potential_advsimd_unrolling (loop_vec_info);
+  void analyze_loop_vinfo (loop_vec_info);
+  void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info, tree,
+		  unsigned int, aarch64_vec_op_count *,
+		  const aarch64_base_vec_issue_info *, unsigned int);
+  fractional_cost adjust_body_cost_sve (const aarch64_vec_issue_info *,
+					fractional_cost, fractional_cost,
+					bool, unsigned int, unsigned int *,
+					bool *);
+  unsigned int adjust_body_cost (unsigned int);
 
-     This variable exists because the vec_info is not passed to the
-     init_cost hook.  We therefore have to defer initialization based on
-     it till later.  */
-  bool analyzed_vinfo = false;
-
-  /* True if we're costing a vector loop, false if we're costing block-level
-     vectorization.  */
-  bool is_loop = false;
+  /* True if we have performed one-time initialization based on the
+     vec_info.  */
+  bool m_analyzed_vinfo = false;
 
   /* True if we've seen an SVE operation that we cannot currently vectorize
      using Advanced SIMD.  */
-  bool saw_sve_only_op = false;
+  bool m_saw_sve_only_op = false;
 
-  /* - If VEC_FLAGS is zero then we're costing the original scalar code.
-     - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
+  /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
+     - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
        SIMD code.
-     - If VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
-  unsigned int vec_flags = 0;
+     - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
+  unsigned int m_vec_flags = 0;
 
   /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
      throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
@@ -14628,39 +14634,39 @@ struct aarch64_vector_costs : public vector_costs
      than length-agnostic SVE, since the SVE loop would execute an unknown
      number of times and so could not be completely unrolled in the same way.
 
-     If we're applying this heuristic, UNROLLED_ADVSIMD_NITERS is the
+     If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
      number of Advanced SIMD loop iterations that would be unrolled and
-     UNROLLED_ADVSIMD_STMTS estimates the total number of statements
+     M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
      in the unrolled loop.  Both values are zero if we're not applying
      the heuristic.  */
-  unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0;
-  unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0;
+  unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
+  unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
 
   /* If we're vectorizing a loop that executes a constant number of times,
      this variable gives the number of times that the vector loop would
      iterate, otherwise it is zero.  */
-  uint64_t num_vector_iterations = 0;
+  uint64_t m_num_vector_iterations = 0;
 
   /* Used only when vectorizing loops.  Estimates the number and kind of scalar
      operations that would be needed to perform the same work as one iteration
      of the vector loop.  */
-  aarch64_vec_op_count scalar_ops;
+  aarch64_vec_op_count m_scalar_ops;
 
-  /* Used only when vectorizing loops.  If VEC_FLAGS & VEC_ADVSIMD,
+  /* Used only when vectorizing loops.  If M_VEC_FLAGS & VEC_ADVSIMD,
      this structure estimates the number and kind of operations that the
-     vector loop would contain.  If VEC_FLAGS & VEC_SVE, the structure
+     vector loop would contain.  If M_VEC_FLAGS & VEC_SVE, the structure
      estimates what the equivalent Advanced SIMD-only code would need in
      order to perform the same work as one iteration of the SVE loop.  */
-  aarch64_vec_op_count advsimd_ops;
+  aarch64_vec_op_count m_advsimd_ops;
 
   /* Used only when vectorizing loops with SVE.  It estimates the number and
      kind of operations that the SVE loop would contain.  */
-  aarch64_sve_op_count sve_ops;
+  aarch64_sve_op_count m_sve_ops;
 
   /* Used to detect cases in which we end up costing the same load twice,
      once to account for results that are actually used and once to account
      for unused results.  */
-  hash_map<nofree_ptr_hash<_stmt_vec_info>, unsigned int> seen_loads;
+  hash_map<nofree_ptr_hash<_stmt_vec_info>, unsigned int> m_seen_loads;
 };
 
 /* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
@@ -14703,12 +14709,11 @@ aarch64_simd_vec_costs_for_flags (unsigned int flags)
 }
 
 /* Decide whether to use the unrolling heuristic described above
-   aarch64_vector_costs::unrolled_advsimd_niters, updating that
-   field if so.  LOOP_VINFO describes the loop that we're vectorizing
-   and COSTS are the costs that we're calculating for it.  */
-static void
-aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
-					    aarch64_vector_costs *costs)
+   m_unrolled_advsimd_niters, updating that field if so.  LOOP_VINFO
+   describes the loop that we're vectorizing.  */
+void
+aarch64_vector_costs::
+record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
 {
   /* The heuristic only makes sense on targets that have the same
      vector throughput for SVE and Advanced SIMD.  */
@@ -14718,7 +14723,7 @@ aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
 
   /* We only want to apply the heuristic if LOOP_VINFO is being
      vectorized for SVE.  */
-  if (!(costs->vec_flags & VEC_ANY_SVE))
+  if (!(m_vec_flags & VEC_ANY_SVE))
     return;
 
   /* Check whether it is possible in principle to use Advanced SIMD
@@ -14751,17 +14756,14 @@ aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
 
   /* Record that we're applying the heuristic and should try to estimate
      the number of statements in the Advanced SIMD loop.  */
-  costs->unrolled_advsimd_niters = unrolled_advsimd_niters;
+  m_unrolled_advsimd_niters = unrolled_advsimd_niters;
 }
 
-/* Do one-time initialization of COSTS given that we're costing the loop
-   vectorization described by LOOP_VINFO.  */
-static void
-aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
-			    aarch64_vector_costs *costs)
+/* Do one-time initialization of the aarch64_vector_costs given that we're
+   costing the loop vectorization described by LOOP_VINFO.  */
+void
+aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
 {
-  costs->is_loop = true;
-
   /* Record the number of times that the vector loop would execute,
      if known.  */
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -14770,26 +14772,14 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
     {
       unsigned int vf = vect_vf_for_cost (loop_vinfo);
       if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
-	costs->num_vector_iterations = scalar_niters / vf;
+	m_num_vector_iterations = scalar_niters / vf;
       else
-	costs->num_vector_iterations = CEIL (scalar_niters, vf);
+	m_num_vector_iterations = CEIL (scalar_niters, vf);
     }
 
-  /* Detect whether we're costing the scalar code or the vector code.
-     This is a bit hacky: it would be better if the vectorizer told
-     us directly.
-
-     If we're costing the vector code, record whether we're vectorizing
-     for Advanced SIMD or SVE.  */
-  if (costs == LOOP_VINFO_TARGET_COST_DATA (loop_vinfo))
-    costs->vec_flags = aarch64_classify_vector_mode (loop_vinfo->vector_mode);
-  else
-    costs->vec_flags = 0;
-
-  /* Detect whether we're vectorizing for SVE and should
-     apply the unrolling heuristic described above
-     aarch64_vector_costs::unrolled_advsimd_niters.  */
-  aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs);
+  /* Detect whether we're vectorizing for SVE and should apply the unrolling
+     heuristic described above m_unrolled_advsimd_niters.  */
+  record_potential_advsimd_unrolling (loop_vinfo);
 
   /* Record the issue information for any SVE WHILE instructions that the
      loop needs.  */
@@ -14804,21 +14794,10 @@ aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
 	if (rgm->type)
 	  num_masks += num_vectors_m1 + 1;
-      costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
+      m_sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
     }
 }
 
-/* Do one-time initialization of COSTS given that we're costing the block
-   vectorization described by BB_VINFO.  */
-static void
-aarch64_analyze_bb_vinfo (bb_vec_info bb_vinfo, aarch64_vector_costs *costs)
-{
-  /* Unfortunately, there's no easy way of telling whether we're costing
-     the vector code or the scalar code, so just assume that we're costing
-     the vector code.  */
-  costs->vec_flags = aarch64_classify_vector_mode (bb_vinfo->vector_mode);
-}
-
 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
 static int
 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@@ -15352,30 +15331,30 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
   return stmt_cost;
 }
 
-/* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for
+/* COUNT, KIND, STMT_INFO and VECTYPE are the same as for
    vector_costs::add_stmt_cost and they describe an operation in the
    body of a vector loop.  Record issue information relating to the vector
-   operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops
-   or COSTS->sve_ops; see the comments above those variables for details.
+   operation in OPS, where OPS is one of m_scalar_ops, m_advsimd_ops
+   or m_sve_ops; see the comments above those variables for details.
    In addition:
 
-   - VEC_FLAGS is zero if OPS is COSTS->scalar_ops.
+   - VEC_FLAGS is zero if OPS is m_scalar_ops.
 
-   - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops.
+   - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is m_advsimd_ops.
 
-   - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops.
+   - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is m_sve_ops.
 
    ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information
    associated with OPS and VEC_FLAGS.  FACTOR says how many iterations of
    the loop described by VEC_FLAGS would be needed to match one iteration
    of the vector loop in VINFO.  */
-static void
-aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
-		   unsigned int count, enum vect_cost_for_stmt kind,
-		   _stmt_vec_info *stmt_info, tree vectype,
-		   unsigned int vec_flags, aarch64_vec_op_count *ops,
-		   const aarch64_base_vec_issue_info *issue_info,
-		   unsigned int factor)
+void
+aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
+				 stmt_vec_info stmt_info, tree vectype,
+				 unsigned int vec_flags,
+				 aarch64_vec_op_count *ops,
+				 const aarch64_base_vec_issue_info *issue_info,
+				 unsigned int factor)
 {
   if (!issue_info)
     return;
@@ -15394,9 +15373,9 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
       && vect_is_reduction (stmt_info))
     {
       unsigned int base
-	= aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype,
+	= aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, vectype,
 					     vec_flags);
-      if (vect_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
+      if (vect_reduc_type (m_vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
 	{
 	  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
 	    {
@@ -15423,7 +15402,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
     }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
+  if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, vec_flags))
     return;
 
   /* When costing scalar statements in vector code, the count already
@@ -15473,7 +15452,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
     {
       bool existed = false;
       unsigned int &prev_count
-	= costs->seen_loads.get_or_insert (stmt_info, &existed);
+	= m_seen_loads.get_or_insert (stmt_info, &existed);
       if (existed)
 	num_copies -= prev_count;
       else
@@ -15504,7 +15483,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
      have only accounted for one.  */
   if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar))
     {
-      int reduc_type = vect_reduc_type (vinfo, stmt_info);
+      int reduc_type = vect_reduc_type (m_vinfo, stmt_info);
       if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD))
 	  || reduc_type == COND_REDUCTION)
 	ops->general_ops += num_copies;
@@ -15517,7 +15496,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
       unsigned int base = (FLOAT_TYPE_P (type)
 			   ? sve_issue->fp_cmp_pred_ops
 			   : sve_issue->int_cmp_pred_ops);
-      costs->sve_ops.pred_ops += base * num_copies;
+      m_sve_ops.pred_ops += base * num_copies;
     }
 
   /* Add any extra overhead associated with LD[234] and ST[234] operations.  */
@@ -15543,8 +15522,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
     {
       unsigned int pairs = CEIL (count, 2);
-      costs->sve_ops.pred_ops
-	+= sve_issue->gather_scatter_pair_pred_ops * pairs;
+      m_sve_ops.pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
       ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
     }
 }
@@ -15564,14 +15542,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
-  bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (m_vinfo);
-  if (!analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+  if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
     {
+      /* If we're costing the vector code, record whether we're vectorizing
+	 for Advanced SIMD or SVE.  */
+      if (!m_costing_for_scalar)
+	m_vec_flags = aarch64_classify_vector_mode (m_vinfo->vector_mode);
+
       if (loop_vinfo)
-	aarch64_analyze_loop_vinfo (loop_vinfo, this);
-      else
-	aarch64_analyze_bb_vinfo (bb_vinfo, this);
-      this->analyzed_vinfo = true;
+	analyze_loop_vinfo (loop_vinfo);
+
+      m_analyzed_vinfo = true;
     }
 
   /* Try to get a more accurate cost by looking at STMT_INFO instead
@@ -15579,7 +15560,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   if (stmt_info && aarch64_use_new_vector_costs_p ())
     {
       if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype))
-	this->saw_sve_only_op = true;
+	m_saw_sve_only_op = true;
 
       /* If we scalarize a strided store, the vectorizer costs one
 	 vec_to_scalar for each element.  However, we can store the first
@@ -15587,10 +15568,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       if (vect_is_store_elt_extraction (kind, stmt_info))
 	count -= 1;
 
-      stmt_cost = aarch64_detect_scalar_stmt_subtype
-	(m_vinfo, kind, stmt_info, stmt_cost);
+      stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
+						      stmt_info, stmt_cost);
 
-      if (vectype && this->vec_flags)
+      if (vectype && m_vec_flags)
 	stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
 							stmt_info, vectype,
 							where, stmt_cost);
@@ -15614,37 +15595,33 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
       if (loop_vinfo
 	  && issue_info
-	  && this->vec_flags
+	  && m_vec_flags
 	  && where == vect_body
 	  && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
 	  && vectype
 	  && stmt_cost != 0)
 	{
 	  /* Record estimates for the scalar code.  */
-	  aarch64_count_ops (m_vinfo, this, count, kind, stmt_info, vectype,
-			     0, &this->scalar_ops, issue_info->scalar,
-			     vect_nunits_for_cost (vectype));
+	  count_ops (count, kind, stmt_info, vectype, 0, &m_scalar_ops,
+		     issue_info->scalar, vect_nunits_for_cost (vectype));
 
 	  if (aarch64_sve_mode_p (m_vinfo->vector_mode) && issue_info->sve)
 	    {
 	      /* Record estimates for a possible Advanced SIMD version
 		 of the SVE code.  */
-	      aarch64_count_ops (m_vinfo, this, count, kind, stmt_info,
-				 vectype, VEC_ADVSIMD, &this->advsimd_ops,
-				 issue_info->advsimd,
-				 aarch64_estimated_sve_vq ());
+	      count_ops (count, kind, stmt_info, vectype, VEC_ADVSIMD,
+			 &m_advsimd_ops, issue_info->advsimd,
+			 aarch64_estimated_sve_vq ());
 
 	      /* Record estimates for the SVE code itself.  */
-	      aarch64_count_ops (m_vinfo, this, count, kind, stmt_info,
-				 vectype, VEC_ANY_SVE, &this->sve_ops,
-				 issue_info->sve, 1);
+	      count_ops (count, kind, stmt_info, vectype, VEC_ANY_SVE,
+			 &m_sve_ops, issue_info->sve, 1);
 	    }
 	  else
 	    /* Record estimates for the Advanced SIMD code.  Treat SVE like
 	       Advanced SIMD if the CPU has no specific SVE costs.  */
-	    aarch64_count_ops (m_vinfo, this, count, kind, stmt_info,
-			       vectype, VEC_ADVSIMD, &this->advsimd_ops,
-			       issue_info->advsimd, 1);
+	    count_ops (count, kind, stmt_info, vectype, VEC_ADVSIMD,
+		       &m_advsimd_ops, issue_info->advsimd, 1);
 	}
 
       /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
@@ -15652,9 +15629,8 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	 loop.  For simplicitly, we assume that one iteration of the
 	 Advanced SIMD loop would need the same number of statements
 	 as one iteration of the SVE loop.  */
-      if (where == vect_body && this->unrolled_advsimd_niters)
-	this->unrolled_advsimd_stmts
-	  += count * this->unrolled_advsimd_niters;
+      if (where == vect_body && m_unrolled_advsimd_niters)
+	m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
     }
   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
 }
@@ -15698,32 +15674,28 @@ aarch64_estimate_min_cycles_per_iter
   return cycles;
 }
 
-/* Subroutine of aarch64_adjust_body_cost for handling SVE.
-   Use ISSUE_INFO to work out how fast the SVE code can be issued and compare
-   it to the equivalent value for scalar code (SCALAR_CYCLES_PER_ITER).
-   If COULD_USE_ADVSIMD is true, also compare it to the issue rate of
-   Advanced SIMD code (ADVSIMD_CYCLES_PER_ITER).
+/* Subroutine of adjust_body_cost for handling SVE.  Use ISSUE_INFO to work out
+   how fast the SVE code can be issued and compare it to the equivalent value
+   for scalar code (SCALAR_CYCLES_PER_ITER).  If COULD_USE_ADVSIMD is true,
+   also compare it to the issue rate of Advanced SIMD code
+   (ADVSIMD_CYCLES_PER_ITER).
 
-   COSTS is as for aarch64_adjust_body_cost.  ORIG_BODY_COST is the cost
-   originally passed to aarch64_adjust_body_cost and *BODY_COST is the current
-   value of the adjusted cost.  *SHOULD_DISPARAGE is true if we think the loop
-   body is too expensive.  */
-static fractional_cost
-aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
-			      const aarch64_vec_issue_info *issue_info,
-			      fractional_cost scalar_cycles_per_iter,
-			      fractional_cost advsimd_cycles_per_iter,
-			      bool could_use_advsimd,
-			      unsigned int orig_body_cost,
-			      unsigned int *body_cost,
-			      bool *should_disparage)
+   ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
+   *BODY_COST is the current value of the adjusted cost.  *SHOULD_DISPARAGE
+   is true if we think the loop body is too expensive.  */
+fractional_cost
+aarch64_vector_costs::
+adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
+		      fractional_cost scalar_cycles_per_iter,
+		      fractional_cost advsimd_cycles_per_iter,
+		      bool could_use_advsimd, unsigned int orig_body_cost,
+		      unsigned int *body_cost, bool *should_disparage)
 {
   /* Estimate the minimum number of cycles per iteration needed to issue
      non-predicate operations.  */
   fractional_cost sve_nonpred_issue_cycles_per_iter
-    = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
-					    issue_info->sve);
+    = aarch64_estimate_min_cycles_per_iter (&m_sve_ops, issue_info->sve);
 
   /* Estimate the minimum number of cycles per iteration needed to rename
      SVE instructions.
@@ -15739,9 +15711,9 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
      ??? This value is very much on the pessimistic side, but seems to work
      pretty well in practice.  */
   sve_rename_cycles_per_iter
-    = { costs->sve_ops.general_ops
-	+ costs->sve_ops.loads
-	+ costs->sve_ops.pred_ops + 1, 5 };
+    = { m_sve_ops.general_ops
+	+ m_sve_ops.loads
+	+ m_sve_ops.pred_ops + 1, 5 };
 
   /* Combine the rename and non-predicate issue limits into a single value.  */
   fractional_cost sve_nonpred_cycles_per_iter
@@ -15750,7 +15722,7 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
   /* Separately estimate the minimum number of cycles per iteration needed
      to issue the predicate operations.  */
   fractional_cost sve_pred_issue_cycles_per_iter
-    = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+    = { m_sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
 
   /* Calculate the overall limit on the number of cycles per iteration.  */
   fractional_cost sve_cycles_per_iter
@@ -15758,15 +15730,15 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
 
   if (dump_enabled_p ())
     {
-      costs->sve_ops.dump ();
+      m_sve_ops.dump ();
       dump_printf_loc (MSG_NOTE, vect_location,
 		       "  estimated cycles per iteration = %f\n",
 		       sve_cycles_per_iter.as_double ());
-      if (costs->sve_ops.pred_ops)
+      if (m_sve_ops.pred_ops)
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "    predicate issue = %f\n",
 			 sve_pred_issue_cycles_per_iter.as_double ());
-      if (costs->sve_ops.pred_ops || sve_rename_cycles_per_iter)
+      if (m_sve_ops.pred_ops || sve_rename_cycles_per_iter)
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "    non-predicate issue = %f\n",
 			 sve_nonpred_issue_cycles_per_iter.as_double ());
@@ -15843,10 +15815,10 @@ aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
   return sve_cycles_per_iter;
 }
 
-/* BODY_COST is the cost of a vector loop body recorded in COSTS.
-   Adjust the cost as necessary and return the new cost.  */
-static unsigned int
-aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
+/* BODY_COST is the cost of a vector loop body.  Adjust the cost as necessary
+   and return the new cost.  */
+unsigned int
+aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
 {
   unsigned int orig_body_cost = body_cost;
   bool should_disparage = false;
@@ -15855,15 +15827,15 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "Original vector body cost = %d\n", body_cost);
 
-  if (costs->unrolled_advsimd_stmts)
+  if (m_unrolled_advsimd_stmts)
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
 			 " unrolled Advanced SIMD loop = %d\n",
-			 costs->unrolled_advsimd_stmts);
+			 m_unrolled_advsimd_stmts);
 
       /* Apply the Advanced SIMD vs. SVE unrolling heuristic described above
-	 aarch64_vector_costs::unrolled_advsimd_niters.
+	 m_unrolled_advsimd_niters.
 
 	 The balance here is tricky.  On the one hand, we can't be sure whether
 	 the code is vectorizable with Advanced SIMD or not.  However, even if
@@ -15871,8 +15843,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
 	 the scalar code could also be unrolled.  Some of the code might then
 	 benefit from SLP, or from using LDP and STP.  We therefore apply
 	 the heuristic regardless of can_use_advsimd_p.  */
-      if (costs->unrolled_advsimd_stmts
-	  && (costs->unrolled_advsimd_stmts
+      if (m_unrolled_advsimd_stmts
+	  && (m_unrolled_advsimd_stmts
 	      <= (unsigned int) param_max_completely_peeled_insns))
 	{
 	  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
@@ -15894,28 +15866,28 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
     return body_cost;
 
   fractional_cost scalar_cycles_per_iter
-    = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops,
+    = aarch64_estimate_min_cycles_per_iter (&m_scalar_ops,
 					    issue_info->scalar);
 
   fractional_cost advsimd_cycles_per_iter
-    = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops,
+    = aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
 					    issue_info->advsimd);
 
   bool could_use_advsimd
-    = ((costs->vec_flags & VEC_ADVSIMD)
+    = ((m_vec_flags & VEC_ADVSIMD)
        || (aarch64_autovec_preference != 2
 	   && (aarch64_tune_params.extra_tuning_flags
 	       & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
-	   && !costs->saw_sve_only_op));
+	   && !m_saw_sve_only_op));
 
   if (dump_enabled_p ())
     {
-      if (IN_RANGE (costs->num_vector_iterations, 0, 65536))
+      if (IN_RANGE (m_num_vector_iterations, 0, 65536))
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "Vector loop iterates at most %wd times\n",
-			 costs->num_vector_iterations);
+			 m_num_vector_iterations);
       dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
-      costs->scalar_ops.dump ();
+      m_scalar_ops.dump ();
       dump_printf_loc (MSG_NOTE, vect_location,
 		       "  estimated cycles per iteration = %f\n",
 		       scalar_cycles_per_iter.as_double ());
@@ -15923,7 +15895,7 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location,
 			   "Advanced SIMD issue estimate:\n");
-	  costs->advsimd_ops.dump ();
+	  m_advsimd_ops.dump ();
 	  dump_printf_loc (MSG_NOTE, vect_location,
 			   "  estimated cycles per iteration = %f\n",
 			   advsimd_cycles_per_iter.as_double ());
@@ -15934,19 +15906,17 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
     }
 
   fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
-  unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency;
+  unsigned int vector_reduction_latency = m_advsimd_ops.reduction_latency;
 
-  if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
+  if ((m_vec_flags & VEC_ANY_SVE) && issue_info->sve)
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
-      vector_reduction_latency = costs->sve_ops.reduction_latency;
+      vector_reduction_latency = m_sve_ops.reduction_latency;
       vector_cycles_per_iter
-	= aarch64_adjust_body_cost_sve (costs, issue_info,
-					scalar_cycles_per_iter,
-					advsimd_cycles_per_iter,
-					could_use_advsimd, orig_body_cost,
-					&body_cost, &should_disparage);
+	= adjust_body_cost_sve (issue_info, scalar_cycles_per_iter,
+				advsimd_cycles_per_iter, could_use_advsimd,
+				orig_body_cost, &body_cost, &should_disparage);
 
       if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
 	{
@@ -15956,22 +15926,22 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_NOTE, vect_location,
 			     "Neoverse V1 estimate:\n");
-	  aarch64_adjust_body_cost_sve (costs, &neoversev1_vec_issue_info,
-					scalar_cycles_per_iter * 2,
-					advsimd_cycles_per_iter * 2,
-					could_use_advsimd, orig_body_cost,
-					&body_cost, &should_disparage);
+	  adjust_body_cost_sve (&neoversev1_vec_issue_info,
+				scalar_cycles_per_iter * 2,
+				advsimd_cycles_per_iter * 2,
+				could_use_advsimd, orig_body_cost,
+				&body_cost, &should_disparage);
 	}
     }
 
   /* Decide whether to stick to latency-based costs or whether to try to
      take issue rates into account.  */
   unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
-  if (costs->vec_flags & VEC_ANY_SVE)
+  if (m_vec_flags & VEC_ANY_SVE)
     threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
 
-  if (costs->num_vector_iterations >= 1
-      && costs->num_vector_iterations < threshold)
+  if (m_num_vector_iterations >= 1
+      && m_num_vector_iterations < threshold)
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
@@ -16004,8 +15974,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
      vector code is an improvement, even if adding the other (non-loop-carried)
      latencies tends to hide this saving.  We therefore reduce the cost of the
      vector loop body in proportion to the saving.  */
-  else if (costs->scalar_ops.reduction_latency > vector_reduction_latency
-	   && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter
+  else if (m_scalar_ops.reduction_latency > vector_reduction_latency
+	   && m_scalar_ops.reduction_latency == scalar_cycles_per_iter
 	   && scalar_cycles_per_iter > vector_cycles_per_iter
 	   && !should_disparage)
     {
@@ -16023,10 +15993,11 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
 void
 aarch64_vector_costs::finish_cost ()
 {
-  if (this->is_loop
-      && this->vec_flags
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
+  if (loop_vinfo
+      && m_vec_flags
       && aarch64_use_new_vector_costs_p ())
-    m_costs[vect_body] = aarch64_adjust_body_cost (this, m_costs[vect_body]);
+    m_costs[vect_body] = adjust_body_cost (m_costs[vect_body]);
 
   vector_costs::finish_cost ();
 }