Model cache auto-prefetcher in scheduler
* config/arm/arm-protos.h (struct tune_params): New field sched_autopref_queue_depth. * config/arm/arm.c (sched-int.h): Include header. (arm_first_cycle_multipass_dfa_lookahead_guard,) (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook. (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,) (arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,) (arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,) (arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,) (arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,) (arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune): Specify sched_autopref_queue_depth value. Enabled for A15 and A57. * config/arm/t-arm (arm.o): Update. * haifa-sched.c (update_insn_after_change): Update. (rank_for_schedule): Use auto-prefetcher model, if requested. (autopref_multipass_init): New static function. (autopref_rank_for_schedule): New rank_for_schedule heuristic. (autopref_multipass_dfa_lookahead_guard_started_dump_p): New static variable for debug dumps. (autopref_multipass_dfa_lookahead_guard_1): New static helper function. (autopref_multipass_dfa_lookahead_guard): New global function that implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook. (init_h_i_d): Update. * params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob. * sched-int.h (enum autopref_multipass_data_status): New const enum. (autopref_multipass_data_): Structure for auto-prefetcher data. (autopref_multipass_data_def, autopref_multipass_data_t): New typedefs. (struct _haifa_insn_data:autopref_multipass_data): New field. (INSN_AUTOPREF_MULTIPASS_DATA): New access macro. (autopref_multipass_dfa_lookahead_guard): Declare. From-SVN: r219789
This commit is contained in:
parent
71acd4776f
commit
340c79045e
|
@ -1,3 +1,36 @@
|
|||
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
|
||||
|
||||
* config/arm/arm-protos.h (struct tune_params): New field
|
||||
sched_autopref_queue_depth.
|
||||
* config/arm/arm.c (sched-int.h): Include header.
|
||||
(arm_first_cycle_multipass_dfa_lookahead_guard,)
|
||||
(TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
|
||||
(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
|
||||
(arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
|
||||
(arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
|
||||
(arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
|
||||
(arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
|
||||
(arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
|
||||
Specify sched_autopref_queue_depth value. Enabled for A15 and A57.
|
||||
* config/arm/t-arm (arm.o): Update.
|
||||
* haifa-sched.c (update_insn_after_change): Update.
|
||||
(rank_for_schedule): Use auto-prefetcher model, if requested.
|
||||
(autopref_multipass_init): New static function.
|
||||
(autopref_rank_for_schedule): New rank_for_schedule heuristic.
|
||||
(autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
|
||||
variable for debug dumps.
|
||||
(autopref_multipass_dfa_lookahead_guard_1): New static helper function.
|
||||
(autopref_multipass_dfa_lookahead_guard): New global function that
|
||||
implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
|
||||
(init_h_i_d): Update.
|
||||
* params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
|
||||
* sched-int.h (enum autopref_multipass_data_status): New const enum.
|
||||
(autopref_multipass_data_): Structure for auto-prefetcher data.
|
||||
(autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
|
||||
(struct _haifa_insn_data:autopref_multipass_data): New field.
|
||||
(INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
|
||||
(autopref_multipass_dfa_lookahead_guard): Declare.
|
||||
|
||||
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
|
||||
|
||||
* rtlanal.c (get_base_term): Handle SCRATCH.
|
||||
|
|
|
@ -291,6 +291,8 @@ struct tune_params
|
|||
int max_insns_inline_memset;
|
||||
/* Bitfield encoding the fuseable pairs of instructions. */
|
||||
unsigned int fuseable_ops;
|
||||
/* Depth of scheduling queue to check for L2 autoprefetcher. */
|
||||
int sched_autopref_queue_depth;
|
||||
};
|
||||
|
||||
extern const struct tune_params *current_tune;
|
||||
|
|
|
@ -97,6 +97,7 @@
|
|||
#include "builtins.h"
|
||||
#include "tm-constrs.h"
|
||||
#include "rtl-iter.h"
|
||||
#include "sched-int.h"
|
||||
|
||||
/* Forward definitions of types. */
|
||||
typedef struct minipool_node Mnode;
|
||||
|
@ -269,6 +270,7 @@ static bool arm_macro_fusion_p (void);
|
|||
static bool arm_cannot_copy_insn_p (rtx_insn *);
|
||||
static int arm_issue_rate (void);
|
||||
static int arm_first_cycle_multipass_dfa_lookahead (void);
|
||||
static int arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *, int);
|
||||
static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
|
||||
static bool arm_output_addr_const_extra (FILE *, rtx);
|
||||
static bool arm_allocate_stack_slots_for_args (void);
|
||||
|
@ -629,6 +631,10 @@ static const struct attribute_spec arm_attribute_table[] =
|
|||
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
|
||||
arm_first_cycle_multipass_dfa_lookahead
|
||||
|
||||
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
|
||||
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
|
||||
arm_first_cycle_multipass_dfa_lookahead_guard
|
||||
|
||||
#undef TARGET_MANGLE_TYPE
|
||||
#define TARGET_MANGLE_TYPE arm_mangle_type
|
||||
|
||||
|
@ -1690,7 +1696,8 @@ const struct tune_params arm_slowmul_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_fastmul_tune =
|
||||
|
@ -1710,7 +1717,8 @@ const struct tune_params arm_fastmul_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
/* StrongARM has early execution of branches, so a sequence that is worth
|
||||
|
@ -1733,7 +1741,8 @@ const struct tune_params arm_strongarm_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_xscale_tune =
|
||||
|
@ -1753,7 +1762,8 @@ const struct tune_params arm_xscale_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_9e_tune =
|
||||
|
@ -1773,7 +1783,8 @@ const struct tune_params arm_9e_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_v6t2_tune =
|
||||
|
@ -1793,7 +1804,8 @@ const struct tune_params arm_v6t2_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
/* Generic Cortex tuning. Use more specific tunings if appropriate. */
|
||||
|
@ -1814,7 +1826,8 @@ const struct tune_params arm_cortex_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a8_tune =
|
||||
|
@ -1834,7 +1847,8 @@ const struct tune_params arm_cortex_a8_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
true, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a7_tune =
|
||||
|
@ -1854,7 +1868,8 @@ const struct tune_params arm_cortex_a7_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
true, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a15_tune =
|
||||
|
@ -1874,7 +1889,8 @@ const struct tune_params arm_cortex_a15_tune =
|
|||
true, true, /* Prefer 32-bit encodings. */
|
||||
true, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
max_insn_queue_index + 1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a53_tune =
|
||||
|
@ -1894,7 +1910,8 @@ const struct tune_params arm_cortex_a53_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a57_tune =
|
||||
|
@ -1914,7 +1931,8 @@ const struct tune_params arm_cortex_a57_tune =
|
|||
true, true, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
|
||||
max_insn_queue_index + 1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_xgene1_tune =
|
||||
|
@ -1934,7 +1952,8 @@ const struct tune_params arm_xgene1_tune =
|
|||
true, true, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
32, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
|
||||
|
@ -1957,7 +1976,8 @@ const struct tune_params arm_cortex_a5_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
true, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a9_tune =
|
||||
|
@ -1977,7 +1997,8 @@ const struct tune_params arm_cortex_a9_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_cortex_a12_tune =
|
||||
|
@ -1997,7 +2018,8 @@ const struct tune_params arm_cortex_a12_tune =
|
|||
true, true, /* Prefer 32-bit encodings. */
|
||||
true, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
|
||||
|
@ -2024,7 +2046,8 @@ const struct tune_params arm_v7m_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
/* Cortex-M7 tuning. */
|
||||
|
@ -2046,7 +2069,8 @@ const struct tune_params arm_cortex_m7_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
|
||||
|
@ -2068,7 +2092,8 @@ const struct tune_params arm_v6m_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
const struct tune_params arm_fa726te_tune =
|
||||
|
@ -2088,7 +2113,8 @@ const struct tune_params arm_fa726te_tune =
|
|||
false, false, /* Prefer 32-bit encodings. */
|
||||
false, /* Prefer Neon for stringops. */
|
||||
8, /* Maximum insns to inline memset. */
|
||||
ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */
|
||||
ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */
|
||||
-1 /* Sched L2 autopref depth. */
|
||||
};
|
||||
|
||||
|
||||
|
@ -3144,6 +3170,13 @@ arm_option_override (void)
|
|||
global_options.x_param_values,
|
||||
global_options_set.x_param_values);
|
||||
|
||||
/* Look through ready list and all of queue for instructions
|
||||
relevant for L2 auto-prefetcher. */
|
||||
maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
|
||||
current_tune->sched_autopref_queue_depth,
|
||||
global_options.x_param_values,
|
||||
global_options_set.x_param_values);
|
||||
|
||||
/* Disable shrink-wrap when optimizing function for size, since it tends to
|
||||
generate additional returns. */
|
||||
if (optimize_function_for_size_p (cfun) && TARGET_THUMB2)
|
||||
|
@ -27153,6 +27186,13 @@ arm_first_cycle_multipass_dfa_lookahead (void)
|
|||
return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
|
||||
}
|
||||
|
||||
/* Enable modeling of L2 auto-prefetcher.  This function is installed as
   the TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook; it
   forwards INSN and READY_INDEX unchanged to the generic
   autopref_multipass_dfa_lookahead_guard and returns that function's
   result.  */
|
||||
static int
|
||||
arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, int ready_index)
|
||||
{
|
||||
return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
|
||||
}
|
||||
|
||||
const char *
|
||||
arm_mangle_type (const_tree type)
|
||||
{
|
||||
|
|
|
@ -91,7 +91,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
|
|||
$(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
|
||||
$(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
|
||||
$(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
|
||||
intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) $(srcdir)/config/arm/arm-cores.def \
|
||||
intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) sched-int.h \
|
||||
$(srcdir)/config/arm/arm-cores.def \
|
||||
$(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \
|
||||
$(srcdir)/config/arm/arm-protos.h \
|
||||
$(srcdir)/config/arm/arm_neon_builtins.def
|
||||
|
|
|
@ -841,6 +841,7 @@ add_delay_dependencies (rtx_insn *insn)
|
|||
/* Forward declarations. */
|
||||
|
||||
static int priority (rtx_insn *);
|
||||
static int autopref_rank_for_schedule (const rtx_insn *, const rtx_insn *);
|
||||
static int rank_for_schedule (const void *, const void *);
|
||||
static void swap_sort (rtx_insn **, int);
|
||||
static void queue_insn (rtx_insn *, int, const char *);
|
||||
|
@ -1184,6 +1185,12 @@ update_insn_after_change (rtx_insn *insn)
|
|||
INSN_COST (insn) = -1;
|
||||
/* Invalidate INSN_TICK, so it'll be recalculated. */
|
||||
INSN_TICK (insn) = INVALID_TICK;
|
||||
|
||||
/* Invalidate autoprefetch data entry. */
|
||||
INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
|
||||
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
|
||||
INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
|
||||
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
|
||||
}
|
||||
|
||||
|
||||
|
@ -2724,6 +2731,13 @@ rank_for_schedule (const void *x, const void *y)
|
|||
if (flag_sched_critical_path_heuristic && priority_val)
|
||||
return rfs_result (RFS_PRIORITY, priority_val, tmp, tmp2);
|
||||
|
||||
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) >= 0)
|
||||
{
|
||||
int autopref = autopref_rank_for_schedule (tmp, tmp2);
|
||||
if (autopref != 0)
|
||||
return autopref;
|
||||
}
|
||||
|
||||
/* Prefer speculative insn with greater dependencies weakness. */
|
||||
if (flag_sched_spec_insn_heuristic && spec_info)
|
||||
{
|
||||
|
@ -5500,6 +5514,241 @@ insn_finishes_cycle_p (rtx_insn *insn)
|
|||
return false;
|
||||
}
|
||||
|
||||
/* Functions to model cache auto-prefetcher.
|
||||
|
||||
Some CPUs have a cache auto-prefetcher, which /seems/ to initiate
|
||||
memory prefetches if it sees instructions with consecutive memory accesses
|
||||
in the instruction stream. Details of such hardware units are not published,
|
||||
so we can only guess what exactly is going on there.
|
||||
In the scheduler, we model an abstract auto-prefetcher. If there are memory
|
||||
insns in the ready list (or the queue) that have same memory base, but
|
||||
different offsets, then we delay the insns with larger offsets until insns
|
||||
with smaller offsets get scheduled. If PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
|
||||
is "1", then we look at the ready list; if it is N>1, then we also look
|
||||
through N-1 queue entries.
|
||||
If the param is N>=0, then rank_for_schedule will consider auto-prefetching
|
||||
among its heuristics.
|
||||
Param value of "-1" disables modelling of the auto-prefetcher. */
|
||||
|
||||
/* Initialize autoprefetcher model data for INSN. */
|
||||
static void
|
||||
autopref_multipass_init (const rtx_insn *insn, int write)
|
||||
{
|
||||
autopref_multipass_data_t data = &INSN_AUTOPREF_MULTIPASS_DATA (insn)[write];
|
||||
|
||||
gcc_assert (data->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED);
|
||||
data->base = NULL_RTX;
|
||||
data->offset = 0;
|
||||
/* Set insn entry initialized, but not relevant for auto-prefetcher. */
|
||||
data->status = AUTOPREF_MULTIPASS_DATA_IRRELEVANT;
|
||||
|
||||
rtx set = single_set (insn);
|
||||
if (set == NULL_RTX)
|
||||
return;
|
||||
|
||||
rtx mem = write ? SET_DEST (set) : SET_SRC (set);
|
||||
if (!MEM_P (mem))
|
||||
return;
|
||||
|
||||
struct address_info info;
|
||||
decompose_mem_address (&info, mem);
|
||||
|
||||
/* TODO: Currently only (base+const) addressing is supported. */
|
||||
if (info.base == NULL || !REG_P (*info.base)
|
||||
|| (info.disp != NULL && !CONST_INT_P (*info.disp)))
|
||||
return;
|
||||
|
||||
/* This insn is relevant for auto-prefetcher. */
|
||||
data->base = *info.base;
|
||||
data->offset = info.disp ? INTVAL (*info.disp) : 0;
|
||||
data->status = AUTOPREF_MULTIPASS_DATA_NORMAL;
|
||||
}
|
||||
|
||||
/* Helper function for rank_for_schedule sorting. */
|
||||
static int
|
||||
autopref_rank_for_schedule (const rtx_insn *insn1, const rtx_insn *insn2)
|
||||
{
|
||||
for (int write = 0; write < 2; ++write)
|
||||
{
|
||||
autopref_multipass_data_t data1
|
||||
= &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
|
||||
autopref_multipass_data_t data2
|
||||
= &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];
|
||||
|
||||
if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
|
||||
autopref_multipass_init (insn1, write);
|
||||
if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
|
||||
continue;
|
||||
|
||||
if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
|
||||
autopref_multipass_init (insn2, write);
|
||||
if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
|
||||
continue;
|
||||
|
||||
if (!rtx_equal_p (data1->base, data2->base))
|
||||
continue;
|
||||
|
||||
return data1->offset - data2->offset;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* True if header of debug dump was printed. */
|
||||
static bool autopref_multipass_dfa_lookahead_guard_started_dump_p;
|
||||
|
||||
/* Helper for autopref_multipass_dfa_lookahead_guard.
|
||||
Return "1" if INSN1 should be delayed in favor of INSN2. */
|
||||
static int
|
||||
autopref_multipass_dfa_lookahead_guard_1 (const rtx_insn *insn1,
|
||||
const rtx_insn *insn2, int write)
|
||||
{
|
||||
autopref_multipass_data_t data1
|
||||
= &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
|
||||
autopref_multipass_data_t data2
|
||||
= &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];
|
||||
|
||||
if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
|
||||
autopref_multipass_init (insn2, write);
|
||||
if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
|
||||
return 0;
|
||||
|
||||
if (rtx_equal_p (data1->base, data2->base)
|
||||
&& data1->offset > data2->offset)
|
||||
{
|
||||
if (sched_verbose >= 2)
|
||||
{
|
||||
if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
|
||||
{
|
||||
fprintf (sched_dump,
|
||||
";;\t\tnot trying in max_issue due to autoprefetch "
|
||||
"model: ");
|
||||
autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
|
||||
}
|
||||
|
||||
fprintf (sched_dump, " %d(%d)", INSN_UID (insn1), INSN_UID (insn2));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* General note:
|
||||
|
||||
We could have also hooked autoprefetcher model into
|
||||
first_cycle_multipass_backtrack / first_cycle_multipass_issue hooks
|
||||
to enable intelligent selection of "[r1+0]=r2; [r1+4]=r3" on the same cycle
|
||||
(e.g., once "[r1+0]=r2" is issued in max_issue(), "[r1+4]=r3" gets
|
||||
unblocked). We don't bother about this yet because target of interest
|
||||
(ARM Cortex-A15) can issue only 1 memory operation per cycle. */
|
||||
|
||||
/* Implementation of first_cycle_multipass_dfa_lookahead_guard hook.
|
||||
Return "1" if INSN1 should not be considered in max_issue due to
|
||||
auto-prefetcher considerations. */
|
||||
int
|
||||
autopref_multipass_dfa_lookahead_guard (rtx_insn *insn1, int ready_index)
|
||||
{
|
||||
int r = 0;
|
||||
|
||||
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) <= 0)
|
||||
return 0;
|
||||
|
||||
if (sched_verbose >= 2 && ready_index == 0)
|
||||
autopref_multipass_dfa_lookahead_guard_started_dump_p = false;
|
||||
|
||||
for (int write = 0; write < 2; ++write)
|
||||
{
|
||||
autopref_multipass_data_t data1
|
||||
= &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
|
||||
|
||||
if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
|
||||
autopref_multipass_init (insn1, write);
|
||||
if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
|
||||
continue;
|
||||
|
||||
if (ready_index == 0
|
||||
&& data1->status == AUTOPREF_MULTIPASS_DATA_DONT_DELAY)
|
||||
/* We allow only a single delay on priviledged instructions.
|
||||
Doing otherwise would cause infinite loop. */
|
||||
{
|
||||
if (sched_verbose >= 2)
|
||||
{
|
||||
if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
|
||||
{
|
||||
fprintf (sched_dump,
|
||||
";;\t\tnot trying in max_issue due to autoprefetch "
|
||||
"model: ");
|
||||
autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
|
||||
}
|
||||
|
||||
fprintf (sched_dump, " *%d*", INSN_UID (insn1));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i2 = 0; i2 < ready.n_ready; ++i2)
|
||||
{
|
||||
rtx_insn *insn2 = get_ready_element (i2);
|
||||
if (insn1 == insn2)
|
||||
continue;
|
||||
r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, write);
|
||||
if (r)
|
||||
{
|
||||
if (ready_index == 0)
|
||||
{
|
||||
r = -1;
|
||||
data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
|
||||
}
|
||||
goto finish;
|
||||
}
|
||||
}
|
||||
|
||||
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) == 1)
|
||||
continue;
|
||||
|
||||
/* Everything from the current queue slot should have been moved to
|
||||
the ready list. */
|
||||
gcc_assert (insn_queue[NEXT_Q_AFTER (q_ptr, 0)] == NULL_RTX);
|
||||
|
||||
int n_stalls = PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) - 1;
|
||||
if (n_stalls > max_insn_queue_index)
|
||||
n_stalls = max_insn_queue_index;
|
||||
|
||||
for (int stalls = 1; stalls <= n_stalls; ++stalls)
|
||||
{
|
||||
for (rtx_insn_list *link = insn_queue[NEXT_Q_AFTER (q_ptr, stalls)];
|
||||
link != NULL_RTX;
|
||||
link = link->next ())
|
||||
{
|
||||
rtx_insn *insn2 = link->insn ();
|
||||
r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2,
|
||||
write);
|
||||
if (r)
|
||||
{
|
||||
/* Queue INSN1 until INSN2 can issue. */
|
||||
r = -stalls;
|
||||
if (ready_index == 0)
|
||||
data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
|
||||
goto finish;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
finish:
|
||||
if (sched_verbose >= 2
|
||||
&& autopref_multipass_dfa_lookahead_guard_started_dump_p
|
||||
&& (ready_index == ready.n_ready - 1 || r < 0))
|
||||
/* This does not /always/ trigger. We don't output EOL if the last
|
||||
insn is not recognized (INSN_CODE < 0) and lookahead_guard is not
|
||||
called. We can live with this. */
|
||||
fprintf (sched_dump, "\n");
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Define type for target data used in multipass scheduling. */
|
||||
#ifndef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T
|
||||
# define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T int
|
||||
|
@ -8710,6 +8959,10 @@ init_h_i_d (rtx_insn *insn)
|
|||
INSN_EXACT_TICK (insn) = INVALID_TICK;
|
||||
INTER_TICK (insn) = INVALID_TICK;
|
||||
TODO_SPEC (insn) = HARD_DEP;
|
||||
INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
|
||||
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
|
||||
INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
|
||||
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -668,6 +668,11 @@ DEFPARAM (PARAM_SCHED_MEM_TRUE_DEP_COST,
|
|||
"Minimal distance between possibly conflicting store and load",
|
||||
1, 0, 0)
|
||||
|
||||
/* Control knob for the scheduler's model of the hardware cache
   auto-prefetcher.  A negative value disables the model, "0" enables only
   the instruction sorting heuristic, and a value N >= 1 additionally
   makes the lookahead guard look into the ready list and N-1 queue
   entries.
   NOTE(review): the trailing "-1, 0, 0" is presumably default, minimum
   and maximum -- confirm against the DEFPARAM definition.  */
DEFPARAM (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
|
||||
"sched-autopref-queue-depth",
|
||||
"Hardware autoprefetcher scheduler model control flag. Number of lookahead cycles the model looks into; at '0' only enable instruction sorting heuristic. Disabled by default.",
|
||||
-1, 0, 0)
|
||||
|
||||
DEFPARAM(PARAM_MAX_LAST_VALUE_RTL,
|
||||
"max-last-value-rtl",
|
||||
"The maximum number of RTL nodes that can be recorded as combiner's last value",
|
||||
|
|
|
@ -793,6 +793,32 @@ struct reg_set_data
|
|||
struct reg_set_data *next_insn_set;
|
||||
};
|
||||
|
||||
/* Status of one auto-prefetcher model entry of an insn (see struct
   autopref_multipass_data_).  Entries start out as UNINITIALIZED and are
   classified as IRRELEVANT or NORMAL on first use; the lookahead guard
   may later switch a relevant entry to DONT_DELAY.  */
enum autopref_multipass_data_status {
|
||||
/* Entry is irrelevant for auto-prefetcher. */
|
||||
AUTOPREF_MULTIPASS_DATA_IRRELEVANT = -2,
|
||||
/* Entry is uninitialized. */
|
||||
AUTOPREF_MULTIPASS_DATA_UNINITIALIZED = -1,
|
||||
/* Entry is relevant for auto-prefetcher and insn can be delayed
|
||||
to allow another insn through. */
|
||||
AUTOPREF_MULTIPASS_DATA_NORMAL = 0,
|
||||
/* Entry is relevant for auto-prefetcher, but insn should not be
|
||||
delayed as that will break scheduling. */
|
||||
AUTOPREF_MULTIPASS_DATA_DONT_DELAY = 1
|
||||
};
|
||||
|
||||
/* Data for modeling cache auto-prefetcher.  One such entry describes
   either the memory read or the memory write of an insn.  */
|
||||
struct autopref_multipass_data_
|
||||
{
|
||||
/* Base part of memory address: the base register of the access, or
   NULL_RTX while the entry is irrelevant for auto-prefetcher.  */
|
||||
rtx base;
|
||||
/* Memory offset: the constant displacement of the access, or 0 if the
   address has no displacement.  */
|
||||
int offset;
|
||||
/* Entry status. */
|
||||
enum autopref_multipass_data_status status;
|
||||
};
|
||||
/* The entry type itself ...  */
typedef struct autopref_multipass_data_ autopref_multipass_data_def;
|
||||
/* ... and a pointer to an entry.  */
typedef autopref_multipass_data_def *autopref_multipass_data_t;
|
||||
|
||||
struct _haifa_insn_data
|
||||
{
|
||||
/* We can't place 'struct _deps_list' into h_i_d instead of deps_list_t
|
||||
|
@ -893,6 +919,10 @@ struct _haifa_insn_data
|
|||
|
||||
/* The deciding reason for INSN's place in the ready list. */
|
||||
int last_rfs_win;
|
||||
|
||||
/* Two entries for cache auto-prefetcher model: one for mem reads,
|
||||
and one for mem writes. */
|
||||
autopref_multipass_data_def autopref_multipass_data[2];
|
||||
};
|
||||
|
||||
typedef struct _haifa_insn_data haifa_insn_data_def;
|
||||
|
@ -915,6 +945,8 @@ extern vec<haifa_insn_data_def> h_i_d;
|
|||
(HID (INSN)->reg_pressure_excess_cost_change)
|
||||
#define INSN_PRIORITY_STATUS(INSN) (HID (INSN)->priority_status)
|
||||
#define INSN_MODEL_INDEX(INSN) (HID (INSN)->model_index)
|
||||
/* The two auto-prefetcher model entries of INSN: element 0 describes
   memory reads, element 1 describes memory writes.  */
#define INSN_AUTOPREF_MULTIPASS_DATA(INSN) \
|
||||
(HID (INSN)->autopref_multipass_data)
|
||||
|
||||
typedef struct _haifa_deps_insn_data haifa_deps_insn_data_def;
|
||||
typedef haifa_deps_insn_data_def *haifa_deps_insn_data_t;
|
||||
|
@ -1363,6 +1395,8 @@ extern int cycle_issued_insns;
|
|||
extern int issue_rate;
|
||||
extern int dfa_lookahead;
|
||||
|
||||
extern int autopref_multipass_dfa_lookahead_guard (rtx_insn *, int);
|
||||
|
||||
extern void ready_sort (struct ready_list *);
|
||||
extern rtx_insn *ready_element (struct ready_list *, int);
|
||||
extern rtx_insn **ready_lastpos (struct ready_list *);
|
||||
|
|
Loading…
Reference in New Issue