Model cache auto-prefetcher in scheduler

* config/arm/arm-protos.h (struct tune_params): New field
	sched_autopref_queue_depth.
	* config/arm/arm.c (sched-int.h): Include header.
	(arm_first_cycle_multipass_dfa_lookahead_guard,)
	(TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
	(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
	(arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
	(arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
	(arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
	(arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
	(arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
	Specify sched_autopref_queue_depth value.  Enabled for A15 and A57.
	* config/arm/t-arm (arm.o): Update.
	* haifa-sched.c (update_insn_after_change): Update.
	(rank_for_schedule): Use auto-prefetcher model, if requested.
	(autopref_multipass_init): New static function.
	(autopref_rank_for_schedule): New rank_for_schedule heuristic.
	(autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
	variable for debug dumps.
	(autopref_multipass_dfa_lookahead_guard_1): New static helper function.
	(autopref_multipass_dfa_lookahead_guard): New global function that
	implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
	(init_h_i_d): Update.
	* params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
	* sched-int.h (enum autopref_multipass_data_status): New const enum.
	(autopref_multipass_data_): Structure for auto-prefetcher data.
	(autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
	(struct _haifa_insn_data:autopref_multipass_data): New field.
	(INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
	(autopref_multipass_dfa_lookahead_guard): Declare.

From-SVN: r219789
This commit is contained in:
Maxim Kuvyrkov 2015-01-17 01:06:43 +00:00 committed by Maxim Kuvyrkov
parent 71acd4776f
commit 340c79045e
7 changed files with 389 additions and 21 deletions

View File

@ -1,3 +1,36 @@
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
* config/arm/arm-protos.h (struct tune_params): New field
sched_autopref_queue_depth.
* config/arm/arm.c (sched-int.h): Include header.
(arm_first_cycle_multipass_dfa_lookahead_guard,)
(TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
(arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
(arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
(arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
(arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
(arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
Specify sched_autopref_queue_depth value. Enabled for A15 and A57.
* config/arm/t-arm (arm.o): Update.
* haifa-sched.c (update_insn_after_change): Update.
(rank_for_schedule): Use auto-prefetcher model, if requested.
(autopref_multipass_init): New static function.
(autopref_rank_for_schedule): New rank_for_schedule heuristic.
(autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
variable for debug dumps.
(autopref_multipass_dfa_lookahead_guard_1): New static helper function.
(autopref_multipass_dfa_lookahead_guard): New global function that
implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
(init_h_i_d): Update.
* params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
* sched-int.h (enum autopref_multipass_data_status): New const enum.
(autopref_multipass_data_): Structure for auto-prefetcher data.
(autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
(struct _haifa_insn_data:autopref_multipass_data): New field.
(INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
(autopref_multipass_dfa_lookahead_guard): Declare.
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
* rtlanal.c (get_base_term): Handle SCRATCH.

View File

@ -291,6 +291,8 @@ struct tune_params
int max_insns_inline_memset;
/* Bitfield encoding the fuseable pairs of instructions. */
unsigned int fuseable_ops;
/* Depth of scheduling queue to check for L2 autoprefetcher. */
int sched_autopref_queue_depth;
};
extern const struct tune_params *current_tune;

View File

@ -97,6 +97,7 @@
#include "builtins.h"
#include "tm-constrs.h"
#include "rtl-iter.h"
#include "sched-int.h"
/* Forward definitions of types. */
typedef struct minipool_node Mnode;
@ -269,6 +270,7 @@ static bool arm_macro_fusion_p (void);
static bool arm_cannot_copy_insn_p (rtx_insn *);
static int arm_issue_rate (void);
static int arm_first_cycle_multipass_dfa_lookahead (void);
static int arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *, int);
static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
static bool arm_output_addr_const_extra (FILE *, rtx);
static bool arm_allocate_stack_slots_for_args (void);
@ -629,6 +631,10 @@ static const struct attribute_spec arm_attribute_table[] =
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
arm_first_cycle_multipass_dfa_lookahead
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
arm_first_cycle_multipass_dfa_lookahead_guard
#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE arm_mangle_type
@ -1690,7 +1696,8 @@ const struct tune_params arm_slowmul_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_fastmul_tune =
@ -1710,7 +1717,8 @@ const struct tune_params arm_fastmul_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
/* StrongARM has early execution of branches, so a sequence that is worth
@ -1733,7 +1741,8 @@ const struct tune_params arm_strongarm_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_xscale_tune =
@ -1753,7 +1762,8 @@ const struct tune_params arm_xscale_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_9e_tune =
@ -1773,7 +1783,8 @@ const struct tune_params arm_9e_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_v6t2_tune =
@ -1793,7 +1804,8 @@ const struct tune_params arm_v6t2_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
/* Generic Cortex tuning. Use more specific tunings if appropriate. */
@ -1814,7 +1826,8 @@ const struct tune_params arm_cortex_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a8_tune =
@ -1834,7 +1847,8 @@ const struct tune_params arm_cortex_a8_tune =
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a7_tune =
@ -1854,7 +1868,8 @@ const struct tune_params arm_cortex_a7_tune =
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a15_tune =
@ -1874,7 +1889,8 @@ const struct tune_params arm_cortex_a15_tune =
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
max_insn_queue_index + 1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a53_tune =
@ -1894,7 +1910,8 @@ const struct tune_params arm_cortex_a53_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a57_tune =
@ -1914,7 +1931,8 @@ const struct tune_params arm_cortex_a57_tune =
true, true, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
max_insn_queue_index + 1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_xgene1_tune =
@ -1934,7 +1952,8 @@ const struct tune_params arm_xgene1_tune =
true, true, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
32, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
@ -1957,7 +1976,8 @@ const struct tune_params arm_cortex_a5_tune =
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a9_tune =
@ -1977,7 +1997,8 @@ const struct tune_params arm_cortex_a9_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_cortex_a12_tune =
@ -1997,7 +2018,8 @@ const struct tune_params arm_cortex_a12_tune =
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
@ -2024,7 +2046,8 @@ const struct tune_params arm_v7m_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
/* Cortex-M7 tuning. */
@ -2046,7 +2069,8 @@ const struct tune_params arm_cortex_m7_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@ -2068,7 +2092,8 @@ const struct tune_params arm_v6m_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
const struct tune_params arm_fa726te_tune =
@ -2088,7 +2113,8 @@ const struct tune_params arm_fa726te_tune =
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8, /* Maximum insns to inline memset. */
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
-1 /* Sched L2 autopref depth. */
};
@ -3144,6 +3170,13 @@ arm_option_override (void)
global_options.x_param_values,
global_options_set.x_param_values);
/* Look through ready list and all of queue for instructions
relevant for L2 auto-prefetcher. */
maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
current_tune->sched_autopref_queue_depth,
global_options.x_param_values,
global_options_set.x_param_values);
/* Disable shrink-wrap when optimizing function for size, since it tends to
generate additional returns. */
if (optimize_function_for_size_p (cfun) && TARGET_THUMB2)
@ -27153,6 +27186,13 @@ arm_first_cycle_multipass_dfa_lookahead (void)
return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}
/* Implementation of TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD.
   The ARM back end has no policy of its own here: it forwards INSN and
   READY_INDEX unchanged to the scheduler's L2 auto-prefetcher model and
   hands back whatever that model answers.  */
static int
arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, int ready_index)
{
  int verdict = autopref_multipass_dfa_lookahead_guard (insn, ready_index);

  return verdict;
}
const char *
arm_mangle_type (const_tree type)
{

View File

@ -91,7 +91,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
$(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
$(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
$(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) $(srcdir)/config/arm/arm-cores.def \
intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) sched-int.h \
$(srcdir)/config/arm/arm-cores.def \
$(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \
$(srcdir)/config/arm/arm-protos.h \
$(srcdir)/config/arm/arm_neon_builtins.def

View File

@ -841,6 +841,7 @@ add_delay_dependencies (rtx_insn *insn)
/* Forward declarations. */
static int priority (rtx_insn *);
static int autopref_rank_for_schedule (const rtx_insn *, const rtx_insn *);
static int rank_for_schedule (const void *, const void *);
static void swap_sort (rtx_insn **, int);
static void queue_insn (rtx_insn *, int, const char *);
@ -1184,6 +1185,12 @@ update_insn_after_change (rtx_insn *insn)
INSN_COST (insn) = -1;
/* Invalidate INSN_TICK, so it'll be recalculated. */
INSN_TICK (insn) = INVALID_TICK;
/* Invalidate autoprefetch data entry. */
INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
}
@ -2724,6 +2731,13 @@ rank_for_schedule (const void *x, const void *y)
if (flag_sched_critical_path_heuristic && priority_val)
return rfs_result (RFS_PRIORITY, priority_val, tmp, tmp2);
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) >= 0)
{
int autopref = autopref_rank_for_schedule (tmp, tmp2);
if (autopref != 0)
return autopref;
}
/* Prefer speculative insn with greater dependencies weakness. */
if (flag_sched_spec_insn_heuristic && spec_info)
{
@ -5500,6 +5514,241 @@ insn_finishes_cycle_p (rtx_insn *insn)
return false;
}
/* Functions to model cache auto-prefetcher.
Some of the CPUs have cache auto-prefetcher, which /seems/ to initiate
memory prefetches if it sees instructions with consecutive memory accesses
in the instruction stream. Details of such hardware units are not published,
so we can only guess what exactly is going on there.
In the scheduler, we model abstract auto-prefetcher. If there are memory
insns in the ready list (or the queue) that have same memory base, but
different offsets, then we delay the insns with larger offsets until insns
with smaller offsets get scheduled. If PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
is "1", then we look at the ready list; if it is N>1, then we also look
through N-1 queue entries.
If the param is N>=0, then rank_for_schedule will consider auto-prefetching
among its heuristics.
Param value of "-1" disables modelling of the auto-prefetcher. */
/* Initialize autoprefetcher model data for INSN. */
static void
autopref_multipass_init (const rtx_insn *insn, int write)
{
autopref_multipass_data_t data = &INSN_AUTOPREF_MULTIPASS_DATA (insn)[write];
gcc_assert (data->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED);
data->base = NULL_RTX;
data->offset = 0;
/* Set insn entry initialized, but not relevant for auto-prefetcher. */
data->status = AUTOPREF_MULTIPASS_DATA_IRRELEVANT;
rtx set = single_set (insn);
if (set == NULL_RTX)
return;
rtx mem = write ? SET_DEST (set) : SET_SRC (set);
if (!MEM_P (mem))
return;
struct address_info info;
decompose_mem_address (&info, mem);
/* TODO: Currently only (base+const) addressing is supported. */
if (info.base == NULL || !REG_P (*info.base)
|| (info.disp != NULL && !CONST_INT_P (*info.disp)))
return;
/* This insn is relevant for auto-prefetcher. */
data->base = *info.base;
data->offset = info.disp ? INTVAL (*info.disp) : 0;
data->status = AUTOPREF_MULTIPASS_DATA_NORMAL;
}
/* Helper function for rank_for_schedule sorting.

   Compare INSN1 and INSN2 under the auto-prefetcher model, first as memory
   reads (WRITE == 0) and then as memory writes (WRITE == 1).  Model entries
   that are still uninitialized are initialized on demand.  For the first
   access kind in which both insns are relevant and use an equal base, the
   result is negative, zero or positive according to whether INSN1's offset
   is smaller than, equal to or larger than INSN2's.  If no access kind
   qualifies, return 0 so that the caller falls through to its remaining
   heuristics.

   NOTE(review): offsets are compared only between insns sharing a base and
   irrelevant insns compare equal to everything, so "equal" is not
   transitive here; confirm the sort using this tolerates that.  */
static int
autopref_rank_for_schedule (const rtx_insn *insn1, const rtx_insn *insn2)
{
  for (int write = 0; write < 2; ++write)
    {
      autopref_multipass_data_t data1
        = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
      autopref_multipass_data_t data2
        = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];

      if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
        autopref_multipass_init (insn1, write);
      if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
        continue;

      if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
        autopref_multipass_init (insn2, write);
      if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
        continue;

      if (!rtx_equal_p (data1->base, data2->base))
        continue;

      /* Three-way compare instead of subtracting: the difference of two
         arbitrary ints can overflow, which is undefined for signed
         arithmetic.  rank_for_schedule returns this value as its comparison
         result, so only the sign is significant.  */
      return (data1->offset > data2->offset) - (data1->offset < data2->offset);
    }

  return 0;
}
/* True if header of debug dump was printed.
   autopref_multipass_dfa_lookahead_guard clears the flag when it is called
   with ready_index 0 while sched_verbose >= 2; the flag is set as soon as the
   "not trying in max_issue due to autoprefetch model" header has been written
   to sched_dump, and it later decides whether a closing newline is printed. */
static bool autopref_multipass_dfa_lookahead_guard_started_dump_p;
/* Helper for autopref_multipass_dfa_lookahead_guard.
   Decide, for memory reads (WRITE == 0) or memory writes (WRITE == 1),
   whether INSN1 has to wait for INSN2.  Return 1 when INSN2 is relevant to
   the model, both insns use an equal base and INSN1 has the larger offset;
   return 0 otherwise.  INSN2's model entry is initialized on demand; INSN1's
   entry is read as-is, so the caller must have initialized it.  With
   sched_verbose >= 2 every positive answer is also logged to sched_dump.  */
static int
autopref_multipass_dfa_lookahead_guard_1 (const rtx_insn *insn1,
                                          const rtx_insn *insn2, int write)
{
  autopref_multipass_data_t mine
    = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
  autopref_multipass_data_t other
    = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];

  if (other->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
    autopref_multipass_init (insn2, write);

  if (other->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
    return 0;

  /* Nothing to delay unless both insns walk the same base and INSN1 sits at
     the strictly larger offset.  */
  if (!rtx_equal_p (mine->base, other->base)
      || mine->offset <= other->offset)
    return 0;

  if (sched_verbose < 2)
    return 1;

  /* Emit the dump header once, then the "delayed(blocker)" UID pair.  */
  if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
    {
      fprintf (sched_dump,
               ";;\t\tnot trying in max_issue due to autoprefetch "
               "model: ");
      autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
    }
  fprintf (sched_dump, " %d(%d)", INSN_UID (insn1), INSN_UID (insn2));

  return 1;
}
/* General note:
We could have also hooked autoprefetcher model into
first_cycle_multipass_backtrack / first_cycle_multipass_issue hooks
to enable intelligent selection of "[r1+0]=r2; [r1+4]=r3" on the same cycle
(e.g., once "[r1+0]=r2" is issued in max_issue(), "[r1+4]=r3" gets
unblocked). We don't bother about this yet because target of interest
(ARM Cortex-A15) can issue only 1 memory operation per cycle. */
/* Implementation of first_cycle_multipass_dfa_lookahead_guard hook.
   INSN1 is the candidate and READY_INDEX its position in the ready list.
   Return values, as produced by the code below:
     0   the model has no objection to INSN1 (also returned whenever
         PARAM_SCHED_AUTOPREF_QUEUE_DEPTH is <= 0);
     1   INSN1 should not be considered in max_issue due to auto-prefetcher
         considerations: another ready insn uses the same base with a
         smaller offset and READY_INDEX is not 0;
    -1   same situation, but READY_INDEX is 0;
    -N   an insn N slots ahead in the insn queue uses the same base with a
         smaller offset.
   NOTE(review): a negative result presumably asks the caller to queue INSN1
   for that many cycles -- confirm against the hook documentation.
   Whenever INSN1 is blocked while at READY_INDEX 0 its entry is marked
   DONT_DELAY, so the next time it is seen at READY_INDEX 0 it is let
   through. */
int
autopref_multipass_dfa_lookahead_guard (rtx_insn *insn1, int ready_index)
{
int r = 0;
/* Values 0 and -1 mean the lookahead guard part of the model is off. */
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) <= 0)
return 0;
/* First insn of the ready list: the dump header has not been printed yet. */
if (sched_verbose >= 2 && ready_index == 0)
autopref_multipass_dfa_lookahead_guard_started_dump_p = false;
/* Check INSN1 first as a memory read (write == 0), then as a memory
   write (write == 1). */
for (int write = 0; write < 2; ++write)
{
autopref_multipass_data_t data1
= &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
autopref_multipass_init (insn1, write);
if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
continue;
if (ready_index == 0
&& data1->status == AUTOPREF_MULTIPASS_DATA_DONT_DELAY)
/* We allow only a single delay on privileged instructions.
Doing otherwise would cause infinite loop. */
{
if (sched_verbose >= 2)
{
if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
{
fprintf (sched_dump,
";;\t\tnot trying in max_issue due to autoprefetch "
"model: ");
autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
}
fprintf (sched_dump, " *%d*", INSN_UID (insn1));
}
continue;
}
/* Look for a blocker among the other insns of the ready list. */
for (int i2 = 0; i2 < ready.n_ready; ++i2)
{
rtx_insn *insn2 = get_ready_element (i2);
if (insn1 == insn2)
continue;
r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, write);
if (r)
{
if (ready_index == 0)
{
/* Turn the answer into -1 and remember that INSN1 has now been
   delayed once. */
r = -1;
data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
}
goto finish;
}
}
/* A depth of exactly 1 restricts the model to the ready list. */
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) == 1)
continue;
/* Everything from the current queue slot should have been moved to
the ready list. */
gcc_assert (insn_queue[NEXT_Q_AFTER (q_ptr, 0)] == NULL_RTX);
/* Number of queue slots to inspect: depth - 1, capped at
   max_insn_queue_index. */
int n_stalls = PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) - 1;
if (n_stalls > max_insn_queue_index)
n_stalls = max_insn_queue_index;
for (int stalls = 1; stalls <= n_stalls; ++stalls)
{
for (rtx_insn_list *link = insn_queue[NEXT_Q_AFTER (q_ptr, stalls)];
link != NULL_RTX;
link = link->next ())
{
rtx_insn *insn2 = link->insn ();
r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2,
write);
if (r)
{
/* Queue INSN1 until INSN2 can issue. */
r = -stalls;
if (ready_index == 0)
data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
goto finish;
}
}
}
}
finish:
/* Close the dump line after the last ready insn, or as soon as a negative
   result is about to be returned. */
if (sched_verbose >= 2
&& autopref_multipass_dfa_lookahead_guard_started_dump_p
&& (ready_index == ready.n_ready - 1 || r < 0))
/* This does not /always/ trigger. We don't output EOL if the last
insn is not recognized (INSN_CODE < 0) and lookahead_guard is not
called. We can live with this. */
fprintf (sched_dump, "\n");
return r;
}
/* Define type for target data used in multipass scheduling. */
#ifndef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T
# define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T int
@ -8710,6 +8959,10 @@ init_h_i_d (rtx_insn *insn)
INSN_EXACT_TICK (insn) = INVALID_TICK;
INTER_TICK (insn) = INVALID_TICK;
TODO_SPEC (insn) = HARD_DEP;
INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
}
}

View File

@ -668,6 +668,11 @@ DEFPARAM (PARAM_SCHED_MEM_TRUE_DEP_COST,
"Minimal distance between possibly conflicting store and load",
1, 0, 0)
/* Depth of the scheduler's hardware auto-prefetcher model.  The default of
   -1 disables the model; 0 enables only the rank_for_schedule sorting
   heuristic; 1 additionally checks the ready list in the lookahead guard;
   N > 1 also looks through N - 1 entries of the insn queue.  */
DEFPARAM (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
"sched-autopref-queue-depth",
"Hardware autoprefetcher scheduler model control flag. Number of lookahead cycles the model looks into; at '0' only enable instruction sorting heuristic. Disabled by default.",
-1, 0, 0)
DEFPARAM(PARAM_MAX_LAST_VALUE_RTL,
"max-last-value-rtl",
"The maximum number of RTL nodes that can be recorded as combiner's last value",

View File

@ -793,6 +793,32 @@ struct reg_set_data
struct reg_set_data *next_insn_set;
};
/* Life-cycle state of one auto-prefetcher model entry of an insn.  */
enum autopref_multipass_data_status
{
  /* The entry has been examined and the insn plays no part in the
     auto-prefetcher model.  */
  AUTOPREF_MULTIPASS_DATA_IRRELEVANT = -2,

  /* The entry has not been examined yet.  */
  AUTOPREF_MULTIPASS_DATA_UNINITIALIZED = -1,

  /* The insn takes part in the model and may be held back so that another
     insn can go first.  */
  AUTOPREF_MULTIPASS_DATA_NORMAL = 0,

  /* The insn takes part in the model but must not be held back, because
     delaying it would break scheduling.  */
  AUTOPREF_MULTIPASS_DATA_DONT_DELAY = 1
};
/* One entry of the cache auto-prefetcher model: what the scheduler has
   learned about a single memory access of an insn.
   NOTE(review): OFFSET is a plain int while it is filled from INTVAL of the
   address displacement -- confirm that narrowing is acceptable.  */
typedef struct autopref_multipass_data_
{
  /* Base part of the accessed memory address.  */
  rtx base;

  /* Constant offset of the access from BASE.  */
  int offset;

  /* State of this entry; see enum autopref_multipass_data_status.  */
  enum autopref_multipass_data_status status;
} autopref_multipass_data_def;

/* Pointer to a model entry.  */
typedef autopref_multipass_data_def *autopref_multipass_data_t;
struct _haifa_insn_data
{
/* We can't place 'struct _deps_list' into h_i_d instead of deps_list_t
@ -893,6 +919,10 @@ struct _haifa_insn_data
/* The deciding reason for INSN's place in the ready list. */
int last_rfs_win;
/* Two entries for cache auto-prefetcher model: one for mem reads,
and one for mem writes. */
autopref_multipass_data_def autopref_multipass_data[2];
};
typedef struct _haifa_insn_data haifa_insn_data_def;
@ -915,6 +945,8 @@ extern vec<haifa_insn_data_def> h_i_d;
(HID (INSN)->reg_pressure_excess_cost_change)
#define INSN_PRIORITY_STATUS(INSN) (HID (INSN)->priority_status)
#define INSN_MODEL_INDEX(INSN) (HID (INSN)->model_index)
#define INSN_AUTOPREF_MULTIPASS_DATA(INSN) \
(HID (INSN)->autopref_multipass_data)
typedef struct _haifa_deps_insn_data haifa_deps_insn_data_def;
typedef haifa_deps_insn_data_def *haifa_deps_insn_data_t;
@ -1363,6 +1395,8 @@ extern int cycle_issued_insns;
extern int issue_rate;
extern int dfa_lookahead;
extern int autopref_multipass_dfa_lookahead_guard (rtx_insn *, int);
extern void ready_sort (struct ready_list *);
extern rtx_insn *ready_element (struct ready_list *, int);
extern rtx_insn **ready_lastpos (struct ready_list *);