aarch64: Introduce --param=aarch64-autovec-preference to select autovec preference in backend

This is a patch that introduces the aarch64-autovec-preference that can
take values from 0 - 4, 0 being the default.
It can be used to override the autovectorisation preferences in the
backend:
0 - use default scheme
1 - only use Advanced SIMD
2 - only use SVE
3 - use Advanced SIMD and SVE, prefer Advanced SIMD in the event of a
tie (as determined by costs)
4 - use Advanced SIMD and SVE, prefer SVE in the event of a tie (as
determined by costs)

It can be valuable for experimentation when comparing SVE and Advanced SIMD
autovectorisation strategies.

It achieves this by adjusting the order of the interleaved SVE and Advanced
SIMD modes in aarch64_autovectorize_vector_modes.
It also adjusts aarch64_preferred_simd_mode to use the new comparison
function to pick Advanced SIMD or SVE to start with.

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/
	* config/aarch64/aarch64.opt
	(-param=aarch64-autovec-preference): Define.
	* config/aarch64/aarch64.c (aarch64_override_options_internal):
	Set aarch64_sve_compare_costs to 0 when preferring only Advanced
	SIMD.
	(aarch64_cmp_autovec_modes): Define.
	(aarch64_preferred_simd_mode): Adjust to use the above.
	(aarch64_autovectorize_vector_modes): Likewise.
	* doc/invoke.texi: Document aarch64-autovec-preference param.
This commit is contained in:
Kyrylo Tkachov 2020-11-27 09:19:33 +00:00
parent def0e5b83d
commit 5f29f3d5dd
3 changed files with 91 additions and 6 deletions

View File

@ -14442,6 +14442,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
SET_OPTION_IF_UNSET (opts, &global_options_set,
param_sched_autopref_queue_depth, queue_depth);
/* If using Advanced SIMD only for autovectorization disable SVE vector costs
comparison. */
if (aarch64_autovec_preference == 1)
SET_OPTION_IF_UNSET (opts, &global_options_set,
aarch64_sve_compare_costs, 0);
/* Set up parameters to be used in prefetching algorithm. Do not
override the defaults unless we are tuning for a core we have
researched values for. */
@ -17282,11 +17288,65 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
return word_mode;
}
static HOST_WIDE_INT aarch64_estimated_poly_value (poly_int64);
/* Decide the relative ordering of SVE mode SVE_M and Advanced SIMD mode
   ASIMD_M for aarch64_autovectorize_vector_modes.  Return true if the SVE
   mode should be preferred over the Advanced SIMD one, false otherwise.
   The result depends on the aarch64-autovec-preference param:
     1 - never prefer SVE;
     2 - always prefer SVE;
     3, 4 - compare estimated widths, breaking ties towards Advanced SIMD
	    (3) or SVE (4);
     any other value - default behaviour, described below.  */
static bool
aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
{
  /* A request for a single ISA settles the question immediately.  */
  if (aarch64_autovec_preference == 1)
    return false;
  if (aarch64_autovec_preference == 2)
    return true;

  /* Values 3 and 4 only choose the winner when the widths tie.  */
  const bool tie_to_asimd = (aarch64_autovec_preference == 3);
  const bool tie_to_sve = (aarch64_autovec_preference == 4);

  aarch64_sve_vector_bits_enum tuned_sve_width = aarch64_tune_params.sve_width;
  poly_int64 sve_units = GET_MODE_NUNITS (sve_m);
  poly_int64 asimd_units = GET_MODE_NUNITS (asimd_m);

  /* With no SVE width registered for the tuning target and no explicit
     tie-break requested, fall back to the generic poly_int comparison,
     which prefers SVE.  */
  if (tuned_sve_width == SVE_SCALABLE && !(tie_to_asimd || tie_to_sve))
    return maybe_gt (sve_units, asimd_units);

  /* Otherwise compare estimates of the runtime element counts.  */
  HOST_WIDE_INT sve_estimate = aarch64_estimated_poly_value (sve_units);
  HOST_WIDE_INT asimd_estimate = aarch64_estimated_poly_value (asimd_units);

  /* Preferring SVE lets it win ties as well as strict wins.  Both the
     explicit Advanced SIMD preference and the default require SVE to be
     strictly wider.  */
  if (tie_to_sve)
    return sve_estimate >= asimd_estimate;
  return sve_estimate > asimd_estimate;
}
/* Return the preferred SIMD container mode for MODE.  The container is
BITS_PER_SVE_VECTOR bits wide when TARGET_SVE is set and
aarch64_cmp_autovec_modes prefers VNx16QImode over V16QImode; otherwise it
is 128 bits wide. */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
/* NOTE(review): BITS is declared twice in this body; presumably the
declaration immediately below is the pre-patch line retained by the diff
view and only the second declaration is live -- confirm. */
poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
/* Take into account explicit auto-vectorization ISA preferences through
aarch64_cmp_autovec_modes. */
poly_int64 bits
= (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
? BITS_PER_SVE_VECTOR : 128;
return aarch64_simd_container_mode (mode, bits);
}
@ -17348,19 +17408,24 @@ aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
- If an Advanced SIMD main loop with N bytes ends up being cheaper
than an SVE main loop with N bytes then by default we'll try to
use the SVE loop to vectorize the epilogue instead. */
unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
bool only_asimd_p = aarch64_autovec_preference == 1;
bool only_sve_p = aarch64_autovec_preference == 2;
unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
unsigned int advsimd_i = 0;
while (advsimd_i < ARRAY_SIZE (advsimd_modes))
while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
{
if (sve_i < ARRAY_SIZE (sve_modes)
&& maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
&& aarch64_cmp_autovec_modes (sve_modes[sve_i],
advsimd_modes[advsimd_i]))
modes->safe_push (sve_modes[sve_i++]);
else
modes->safe_push (advsimd_modes[advsimd_i++]);
}
while (sve_i < ARRAY_SIZE (sve_modes))
modes->safe_push (sve_modes[sve_i++]);
modes->safe_push (sve_modes[sve_i++]);
unsigned int flags = 0;
/* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we

View File

@ -275,3 +275,5 @@ The number of Newton iterations for calculating the reciprocal for float type.
Target Joined UInteger Var(aarch64_double_recp_precision) Init(2) IntegerRange(1, 5) Param
The number of Newton iterations for calculating the reciprocal for double type. The precision of division is proportional to this param when division approximation is enabled. The default value is 2.
-param=aarch64-autovec-preference=
Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4) Param

View File

@ -14125,6 +14125,24 @@ The number of Newton iterations for calculating the reciprocal for double type.
The precision of division is proportional to this param when division
approximation is enabled. The default value is 2.
@item aarch64-autovec-preference
Force an ISA selection strategy for auto-vectorization. Accepts values from
0 to 4, inclusive.
@table @samp
@item 0
Use the default heuristics.
@item 1
Use only Advanced SIMD for auto-vectorization.
@item 2
Use only SVE for auto-vectorization.
@item 3
Use both Advanced SIMD and SVE. Prefer Advanced SIMD when the costs are
deemed equal.
@item 4
Use both Advanced SIMD and SVE. Prefer SVE when the costs are deemed equal.
@end table
The default value is 0.
@end table
@end table