[Patch 01/02] Introduce prefetch-minimum stride option

This patch adds a new option that sets the minimum stride a memory
reference must have before the loop prefetch pass may issue software
prefetch hints for it. There are two motivations:

* Make the pass less aggressive, only issuing prefetch hints for bigger strides
that are more likely to benefit from prefetching. I've noticed a case in cpu2017
where we were issuing thousands of hints, for example.

* For processors that have a hardware prefetcher, like Falkor, it allows the
loop prefetch pass to defer prefetching of smaller (less than the threshold)
strides to the hardware prefetcher instead. This prevents conflicts between
the software prefetcher and the hardware prefetcher.

I've noticed considerable reduction in the number of prefetch hints and
slightly positive performance numbers. This aligns GCC and LLVM in terms of
prefetch behavior for Falkor.

The default settings should guarantee no changes for existing targets. Those
are free to tweak the settings as necessary.

gcc/ChangeLog:

2018-05-23  Luis Machado  <luis.machado@linaro.org>

	* config/aarch64/aarch64-protos.h (cpu_prefetch_tune)
	<minimum_stride>: New const int field.
	* config/aarch64/aarch64.c (generic_prefetch_tune): Update to include
	minimum_stride field defaulting to -1.
	(exynosm1_prefetch_tune): Likewise.
	(thunderxt88_prefetch_tune): Likewise.
	(thunderx_prefetch_tune): Likewise.
	(thunderx2t99_prefetch_tune): Likewise.
	(qdf24xx_prefetch_tune) <minimum_stride>: Set to 2048.
	<default_opt_level>: Set to 3.
	(aarch64_override_options_internal): Update to set
	PARAM_PREFETCH_MINIMUM_STRIDE.
	* doc/invoke.texi (prefetch-minimum-stride): Document new option.
	* params.def (PARAM_PREFETCH_MINIMUM_STRIDE): New.
	* params.h (PARAM_PREFETCH_MINIMUM_STRIDE): Define.
	* tree-ssa-loop-prefetch.c (should_issue_prefetch_p): Return false if
	stride is constant and is below the minimum stride threshold.

From-SVN: r260617
This commit is contained in:
Luis Machado 2018-05-23 16:20:30 +00:00 committed by Luis Machado
parent cf290ea325
commit 59100dfc42
7 changed files with 78 additions and 1 deletions

View File

@ -1,3 +1,23 @@
2018-05-23 Luis Machado <luis.machado@linaro.org>
* config/aarch64/aarch64-protos.h (cpu_prefetch_tune)
<minimum_stride>: New const int field.
* config/aarch64/aarch64.c (generic_prefetch_tune): Update to include
minimum_stride field defaulting to -1.
(exynosm1_prefetch_tune): Likewise.
(thunderxt88_prefetch_tune): Likewise.
(thunderx_prefetch_tune): Likewise.
(thunderx2t99_prefetch_tune): Likewise.
(qdf24xx_prefetch_tune) <minimum_stride>: Set to 2048.
<default_opt_level>: Set to 3.
(aarch64_override_options_internal): Update to set
PARAM_PREFETCH_MINIMUM_STRIDE.
* doc/invoke.texi (prefetch-minimum-stride): Document new option.
* params.def (PARAM_PREFETCH_MINIMUM_STRIDE): New.
* params.h (PARAM_PREFETCH_MINIMUM_STRIDE): Define.
* tree-ssa-loop-prefetch.c (should_issue_prefetch_p): Return false if
stride is constant and is below the minimum stride threshold.
2018-05-23 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-cpus.in (mode26): Delete.

View File

@ -230,6 +230,9 @@ struct cpu_prefetch_tune
const int l1_cache_size;
const int l1_cache_line_size;
const int l2_cache_size;
/* The minimum constant stride a memory reference must have before
prefetch hints are issued for it. */
const int minimum_stride;
const int default_opt_level;
};

View File

@ -550,6 +550,7 @@ static const cpu_prefetch_tune generic_prefetch_tune =
-1, /* l1_cache_size */
-1, /* l1_cache_line_size */
-1, /* l2_cache_size */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
@ -559,6 +560,7 @@ static const cpu_prefetch_tune exynosm1_prefetch_tune =
-1, /* l1_cache_size */
64, /* l1_cache_line_size */
-1, /* l2_cache_size */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
@ -568,7 +570,8 @@ static const cpu_prefetch_tune qdf24xx_prefetch_tune =
32, /* l1_cache_size */
64, /* l1_cache_line_size */
512, /* l2_cache_size */
-1 /* default_opt_level */
2048, /* minimum_stride */
3 /* default_opt_level */
};
static const cpu_prefetch_tune thunderxt88_prefetch_tune =
@ -577,6 +580,7 @@ static const cpu_prefetch_tune thunderxt88_prefetch_tune =
32, /* l1_cache_size */
128, /* l1_cache_line_size */
16*1024, /* l2_cache_size */
-1, /* minimum_stride */
3 /* default_opt_level */
};
@ -586,6 +590,7 @@ static const cpu_prefetch_tune thunderx_prefetch_tune =
32, /* l1_cache_size */
128, /* l1_cache_line_size */
-1, /* l2_cache_size */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
@ -595,6 +600,7 @@ static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
32, /* l1_cache_size */
64, /* l1_cache_line_size */
256, /* l2_cache_size */
-1, /* minimum_stride */
-1 /* default_opt_level */
};
@ -10629,6 +10635,11 @@ aarch64_override_options_internal (struct gcc_options *opts)
aarch64_tune_params.prefetch->l2_cache_size,
opts->x_param_values,
global_options_set.x_param_values);
/* Only override the prefetch-minimum-stride parameter when the selected
tuning provides a non-negative value; tunings that use -1 for
minimum_stride leave the parameter alone.  */
if (aarch64_tune_params.prefetch->minimum_stride >= 0)
maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
aarch64_tune_params.prefetch->minimum_stride,
opts->x_param_values,
global_options_set.x_param_values);
/* Use the alternative scheduling-pressure algorithm by default. */
maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,

View File

@ -10734,6 +10734,21 @@ The size of L1 cache, in kilobytes.
@item l2-cache-size
The size of L2 cache, in kilobytes.
@item prefetch-minimum-stride
Minimum constant stride, in bytes, that a memory reference must have for
prefetch hints to be issued. If the stride is less than this threshold,
prefetch hints will not be issued.
This setting is useful for processors that have hardware prefetchers, in
which case there may be conflicts between the hardware prefetchers and
the software prefetchers. If the hardware prefetchers have a maximum
stride they can handle, it should be used here to improve the use of
software prefetchers.
A value of -1, the default, means we don't have a threshold and therefore
prefetch hints can be issued for any constant stride.
This setting is only useful for strides that are known and constant.
@item loop-interchange-max-num-stmts
The maximum number of stmts in a loop to be interchanged.

View File

@ -795,6 +795,15 @@ DEFPARAM (PARAM_L2_CACHE_SIZE,
"The size of L2 cache.",
512, 0, 0)
/* The minimum constant stride, in bytes, that a memory reference must have
before prefetch hints are issued for it.  The default of -1 means there
is no threshold.  */
DEFPARAM (PARAM_PREFETCH_MINIMUM_STRIDE,
"prefetch-minimum-stride",
"The minimum constant stride beyond which we should use prefetch "
"hints for.",
-1, 0, 0)
/* Maximum number of statements in loop nest for loop interchange. */
DEFPARAM (PARAM_LOOP_INTERCHANGE_MAX_NUM_STMTS,

View File

@ -196,6 +196,8 @@ extern void init_param_values (int *params);
PARAM_VALUE (PARAM_L1_CACHE_LINE_SIZE)
#define L2_CACHE_SIZE \
PARAM_VALUE (PARAM_L2_CACHE_SIZE)
/* Shorthand for the value of PARAM_PREFETCH_MINIMUM_STRIDE.  */
#define PREFETCH_MINIMUM_STRIDE \
PARAM_VALUE (PARAM_PREFETCH_MINIMUM_STRIDE)
#define USE_CANONICAL_TYPES \
PARAM_VALUE (PARAM_USE_CANONICAL_TYPES)
#define IRA_MAX_LOOPS_NUM \

View File

@ -992,6 +992,23 @@ prune_by_reuse (struct mem_ref_group *groups)
static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
/* Some processors may have a hardware prefetcher that may conflict with
   prefetch hints for a range of strides.  Make sure we don't issue
   prefetches for such cases if the stride is within this particular
   range.  Only constant steps are checked, and with the default
   threshold of -1 the test never fires because an absolute value is
   never below a negative number.  */
if (cst_and_fits_in_hwi (ref->group->step)
    && abs_hwi (int_cst_value (ref->group->step))
       < (HOST_WIDE_INT) PREFETCH_MINIMUM_STRIDE)
  {
    if (dump_file && (dump_flags & TDF_DETAILS))
      /* The step is handled as a HOST_WIDE_INT here, which is not 'long'
	 on every host, so print it with HOST_WIDE_INT_PRINT_DEC instead
	 of "%ld" to keep the format and the argument type in agreement.  */
      fprintf (dump_file,
	       "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
	       ") is less than the mininum required stride of %d\n",
	       ref->group->uid, ref->uid, int_cst_value (ref->group->step),
	       PREFETCH_MINIMUM_STRIDE);
    return false;
  }
/* For now do not issue prefetches for only first few of the
iterations. */
if (ref->prefetch_before != PREFETCH_ALL)