From 340c79045e35c6bceeb60afb30b1001c238c93b6 Mon Sep 17 00:00:00 2001 From: Maxim Kuvyrkov Date: Sat, 17 Jan 2015 01:06:43 +0000 Subject: [PATCH] Model cache auto-prefetcher in scheduler * config/arm/arm-protos.h (struct tune_params): New field sched_autopref_queue_depth. * config/arm/arm.c (sched-int.h): Include header. (arm_first_cycle_multipass_dfa_lookahead_guard,) (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook. (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,) (arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,) (arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,) (arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,) (arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,) (arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune): Specify sched_autopref_queue_depth value. Enabled for A15 and A57. * config/arm/t-arm (arm.o): Update. * haifa-sched.c (update_insn_after_change): Update. (rank_for_schedule): Use auto-prefetcher model, if requested. (autopref_multipass_init): New static function. (autopref_rank_for_schedule): New rank_for_schedule heuristic. (autopref_multipass_dfa_lookahead_guard_started_dump_p): New static variable for debug dumps. (autopref_multipass_dfa_lookahead_guard_1): New static helper function. (autopref_multipass_dfa_lookahead_guard): New global function that implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook. (init_h_i_d): Update. * params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob. * sched-int.h (enum autopref_multipass_data_status): New const enum. (autopref_multipass_data_): Structure for auto-prefetcher data. (autopref_multipass_data_def, autopref_multipass_data_t): New typedefs. (struct _haifa_insn_data:autopref_multipass_data): New field. (INSN_AUTOPREF_MULTIPASS_DATA): New access macro. (autopref_multipass_dfa_lookahead_guard): Declare. 
From-SVN: r219789 --- gcc/ChangeLog | 33 +++++ gcc/config/arm/arm-protos.h | 2 + gcc/config/arm/arm.c | 80 +++++++++--- gcc/config/arm/t-arm | 3 +- gcc/haifa-sched.c | 253 ++++++++++++++++++++++++++++++++++++ gcc/params.def | 5 + gcc/sched-int.h | 34 +++++ 7 files changed, 389 insertions(+), 21 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8cc1dc7bf99..4f0414a6b16 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,36 @@ +2015-01-17 Maxim Kuvyrkov + + * config/arm/arm-protos.h (struct tune_params): New field + sched_autopref_queue_depth. + * config/arm/arm.c (sched-int.h): Include header. + (arm_first_cycle_multipass_dfa_lookahead_guard,) + (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook. + (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,) + (arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,) + (arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,) + (arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,) + (arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,) + (arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune): + Specify sched_autopref_queue_depth value. Enabled for A15 and A57. + * config/arm/t-arm (arm.o): Update. + * haifa-sched.c (update_insn_after_change): Update. + (rank_for_schedule): Use auto-prefetcher model, if requested. + (autopref_multipass_init): New static function. + (autopref_rank_for_schedule): New rank_for_schedule heuristic. + (autopref_multipass_dfa_lookahead_guard_started_dump_p): New static + variable for debug dumps. + (autopref_multipass_dfa_lookahead_guard_1): New static helper function. + (autopref_multipass_dfa_lookahead_guard): New global function that + implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook. + (init_h_i_d): Update. + * params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob. + * sched-int.h (enum autopref_multipass_data_status): New const enum. 
+ (autopref_multipass_data_): Structure for auto-prefetcher data. + (autopref_multipass_data_def, autopref_multipass_data_t): New typedefs. + (struct _haifa_insn_data:autopref_multipass_data): New field. + (INSN_AUTOPREF_MULTIPASS_DATA): New access macro. + (autopref_multipass_dfa_lookahead_guard): Declare. + 2015-01-17 Maxim Kuvyrkov * rtlanal.c (get_base_term): Handle SCRATCH. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 320215bcaf6..3db7e1695f8 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -291,6 +291,8 @@ struct tune_params int max_insns_inline_memset; /* Bitfield encoding the fuseable pairs of instructions. */ unsigned int fuseable_ops; + /* Depth of scheduling queue to check for L2 autoprefetcher. */ + int sched_autopref_queue_depth; }; extern const struct tune_params *current_tune; diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 337a69b43e0..fddd7708972 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -97,6 +97,7 @@ #include "builtins.h" #include "tm-constrs.h" #include "rtl-iter.h" +#include "sched-int.h" /* Forward definitions of types. 
*/ typedef struct minipool_node Mnode; @@ -269,6 +270,7 @@ static bool arm_macro_fusion_p (void); static bool arm_cannot_copy_insn_p (rtx_insn *); static int arm_issue_rate (void); static int arm_first_cycle_multipass_dfa_lookahead (void); +static int arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *, int); static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; static bool arm_output_addr_const_extra (FILE *, rtx); static bool arm_allocate_stack_slots_for_args (void); @@ -629,6 +631,10 @@ static const struct attribute_spec arm_attribute_table[] = #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ arm_first_cycle_multipass_dfa_lookahead +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \ + arm_first_cycle_multipass_dfa_lookahead_guard + #undef TARGET_MANGLE_TYPE #define TARGET_MANGLE_TYPE arm_mangle_type @@ -1690,7 +1696,8 @@ const struct tune_params arm_slowmul_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_fastmul_tune = @@ -1710,7 +1717,8 @@ const struct tune_params arm_fastmul_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; /* StrongARM has early execution of branches, so a sequence that is worth @@ -1733,7 +1741,8 @@ const struct tune_params arm_strongarm_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. 
*/ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_xscale_tune = @@ -1753,7 +1762,8 @@ const struct tune_params arm_xscale_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_9e_tune = @@ -1773,7 +1783,8 @@ const struct tune_params arm_9e_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_v6t2_tune = @@ -1793,7 +1804,8 @@ const struct tune_params arm_v6t2_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; /* Generic Cortex tuning. Use more specific tunings if appropriate. */ @@ -1814,7 +1826,8 @@ const struct tune_params arm_cortex_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a8_tune = @@ -1834,7 +1847,8 @@ const struct tune_params arm_cortex_a8_tune = false, false, /* Prefer 32-bit encodings. */ true, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. 
*/ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a7_tune = @@ -1854,7 +1868,8 @@ const struct tune_params arm_cortex_a7_tune = false, false, /* Prefer 32-bit encodings. */ true, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a15_tune = @@ -1874,7 +1889,8 @@ const struct tune_params arm_cortex_a15_tune = true, true, /* Prefer 32-bit encodings. */ true, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + max_insn_queue_index + 1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a53_tune = @@ -1894,7 +1910,8 @@ const struct tune_params arm_cortex_a53_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */ + ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a57_tune = @@ -1914,7 +1931,8 @@ const struct tune_params arm_cortex_a57_tune = true, true, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */ + ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */ + max_insn_queue_index + 1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_xgene1_tune = @@ -1934,7 +1952,8 @@ const struct tune_params arm_xgene1_tune = true, true, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 32, /* Maximum insns to inline memset. 
*/ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; /* Branches can be dual-issued on Cortex-A5, so conditional execution is @@ -1957,7 +1976,8 @@ const struct tune_params arm_cortex_a5_tune = false, false, /* Prefer 32-bit encodings. */ true, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a9_tune = @@ -1977,7 +1997,8 @@ const struct tune_params arm_cortex_a9_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_cortex_a12_tune = @@ -1997,7 +2018,8 @@ const struct tune_params arm_cortex_a12_tune = true, true, /* Prefer 32-bit encodings. */ true, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_MOVW_MOVT /* Fuseable pairs of instructions. */ + ARM_FUSE_MOVW_MOVT, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; /* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single @@ -2024,7 +2046,8 @@ const struct tune_params arm_v7m_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; /* Cortex-M7 tuning. */ @@ -2046,7 +2069,8 @@ const struct tune_params arm_cortex_m7_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. 
*/ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than @@ -2068,7 +2092,8 @@ const struct tune_params arm_v6m_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; const struct tune_params arm_fa726te_tune = @@ -2088,7 +2113,8 @@ const struct tune_params arm_fa726te_tune = false, false, /* Prefer 32-bit encodings. */ false, /* Prefer Neon for stringops. */ 8, /* Maximum insns to inline memset. */ - ARM_FUSE_NOTHING /* Fuseable pairs of instructions. */ + ARM_FUSE_NOTHING, /* Fuseable pairs of instructions. */ + -1 /* Sched L2 autopref depth. */ }; @@ -3144,6 +3170,13 @@ arm_option_override (void) global_options.x_param_values, global_options_set.x_param_values); + /* Look through ready list and all of queue for instructions + relevant for L2 auto-prefetcher. */ + maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH, + current_tune->sched_autopref_queue_depth, + global_options.x_param_values, + global_options_set.x_param_values); + /* Disable shrink-wrap when optimizing function for size, since it tends to generate additional returns. */ if (optimize_function_for_size_p (cfun) && TARGET_THUMB2) @@ -27153,6 +27186,13 @@ arm_first_cycle_multipass_dfa_lookahead (void) return issue_rate > 1 && !sched_fusion ? issue_rate : 0; } +/* Enable modeling of L2 auto-prefetcher. 
*/ +static int +arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, int ready_index) +{ + return autopref_multipass_dfa_lookahead_guard (insn, ready_index); +} + const char * arm_mangle_type (const_tree type) { diff --git a/gcc/config/arm/t-arm b/gcc/config/arm/t-arm index 4ef38a87f01..ab5b6e7d598 100644 --- a/gcc/config/arm/t-arm +++ b/gcc/config/arm/t-arm @@ -91,7 +91,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \ $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \ $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \ - intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) $(srcdir)/config/arm/arm-cores.def \ + intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) sched-int.h \ + $(srcdir)/config/arm/arm-cores.def \ $(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \ $(srcdir)/config/arm/arm-protos.h \ $(srcdir)/config/arm/arm_neon_builtins.def diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c index 98cb9e4ba56..795ff79e898 100644 --- a/gcc/haifa-sched.c +++ b/gcc/haifa-sched.c @@ -841,6 +841,7 @@ add_delay_dependencies (rtx_insn *insn) /* Forward declarations. */ static int priority (rtx_insn *); +static int autopref_rank_for_schedule (const rtx_insn *, const rtx_insn *); static int rank_for_schedule (const void *, const void *); static void swap_sort (rtx_insn **, int); static void queue_insn (rtx_insn *, int, const char *); @@ -1184,6 +1185,12 @@ update_insn_after_change (rtx_insn *insn) INSN_COST (insn) = -1; /* Invalidate INSN_TICK, so it'll be recalculated. */ INSN_TICK (insn) = INVALID_TICK; + + /* Invalidate autoprefetch data entry. 
+ INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; + INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; } @@ -2724,6 +2731,13 @@ rank_for_schedule (const void *x, const void *y) if (flag_sched_critical_path_heuristic && priority_val) return rfs_result (RFS_PRIORITY, priority_val, tmp, tmp2); + if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) >= 0) + { + int autopref = autopref_rank_for_schedule (tmp, tmp2); + if (autopref != 0) + return autopref; + } + /* Prefer speculative insn with greater dependencies weakness. */ if (flag_sched_spec_insn_heuristic && spec_info) { @@ -5500,6 +5514,241 @@ insn_finishes_cycle_p (rtx_insn *insn) return false; } +/* Functions to model cache auto-prefetcher. + + Some of the CPUs have cache auto-prefetcher, which /seems/ to initiate + memory prefetches if it sees instructions with consecutive memory accesses + in the instruction stream. Details of such hardware units are not published, + so we can only guess what exactly is going on there. + In the scheduler, we model abstract auto-prefetcher. If there are memory + insns in the ready list (or the queue) that have same memory base, but + different offsets, then we delay the insns with larger offsets until insns + with smaller offsets get scheduled. If PARAM_SCHED_AUTOPREF_QUEUE_DEPTH + is "1", then we look at the ready list; if it is N>1, then we also look + through N-1 queue entries. + If the param is N>=0, then rank_for_schedule will consider auto-prefetching + among its heuristics. + Param value of "-1" disables modelling of the auto-prefetcher. */ + +/* Initialize autoprefetcher model data for INSN. 
*/ +static void +autopref_multipass_init (const rtx_insn *insn, int write) +{ + autopref_multipass_data_t data = &INSN_AUTOPREF_MULTIPASS_DATA (insn)[write]; + + gcc_assert (data->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED); + data->base = NULL_RTX; + data->offset = 0; + /* Set insn entry initialized, but not relevant for auto-prefetcher. */ + data->status = AUTOPREF_MULTIPASS_DATA_IRRELEVANT; + + rtx set = single_set (insn); + if (set == NULL_RTX) + return; + + rtx mem = write ? SET_DEST (set) : SET_SRC (set); + if (!MEM_P (mem)) + return; + + struct address_info info; + decompose_mem_address (&info, mem); + + /* TODO: Currently only (base+const) addressing is supported. */ + if (info.base == NULL || !REG_P (*info.base) + || (info.disp != NULL && !CONST_INT_P (*info.disp))) + return; + + /* This insn is relevant for auto-prefetcher. */ + data->base = *info.base; + data->offset = info.disp ? INTVAL (*info.disp) : 0; + data->status = AUTOPREF_MULTIPASS_DATA_NORMAL; +} + +/* Helper function for rank_for_schedule sorting. */ +static int +autopref_rank_for_schedule (const rtx_insn *insn1, const rtx_insn *insn2) +{ + for (int write = 0; write < 2; ++write) + { + autopref_multipass_data_t data1 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write]; + autopref_multipass_data_t data2 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write]; + + if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn1, write); + if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + continue; + + if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn2, write); + if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + continue; + + if (!rtx_equal_p (data1->base, data2->base)) + continue; + + return data1->offset - data2->offset; + } + + return 0; +} + +/* True if header of debug dump was printed. 
*/ +static bool autopref_multipass_dfa_lookahead_guard_started_dump_p; + +/* Helper for autopref_multipass_dfa_lookahead_guard. + Return "1" if INSN1 should be delayed in favor of INSN2. */ +static int +autopref_multipass_dfa_lookahead_guard_1 (const rtx_insn *insn1, + const rtx_insn *insn2, int write) +{ + autopref_multipass_data_t data1 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write]; + autopref_multipass_data_t data2 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write]; + + if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn2, write); + if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + return 0; + + if (rtx_equal_p (data1->base, data2->base) + && data1->offset > data2->offset) + { + if (sched_verbose >= 2) + { + if (!autopref_multipass_dfa_lookahead_guard_started_dump_p) + { + fprintf (sched_dump, + ";;\t\tnot trying in max_issue due to autoprefetch " + "model: "); + autopref_multipass_dfa_lookahead_guard_started_dump_p = true; + } + + fprintf (sched_dump, " %d(%d)", INSN_UID (insn1), INSN_UID (insn2)); + } + + return 1; + } + + return 0; +} + +/* General note: + + We could have also hooked autoprefetcher model into + first_cycle_multipass_backtrack / first_cycle_multipass_issue hooks + to enable intelligent selection of "[r1+0]=r2; [r1+4]=r3" on the same cycle + (e.g., once "[r1+0]=r2" is issued in max_issue(), "[r1+4]=r3" gets + unblocked). We don't bother about this yet because target of interest + (ARM Cortex-A15) can issue only 1 memory operation per cycle. */ + +/* Implementation of first_cycle_multipass_dfa_lookahead_guard hook. + Return "1" if INSN1 should not be considered in max_issue due to + auto-prefetcher considerations. 
+int +autopref_multipass_dfa_lookahead_guard (rtx_insn *insn1, int ready_index) +{ + int r = 0; + + if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) <= 0) + return 0; + + if (sched_verbose >= 2 && ready_index == 0) + autopref_multipass_dfa_lookahead_guard_started_dump_p = false; + + for (int write = 0; write < 2; ++write) + { + autopref_multipass_data_t data1 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write]; + + if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn1, write); + if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + continue; + + if (ready_index == 0 + && data1->status == AUTOPREF_MULTIPASS_DATA_DONT_DELAY) + /* We allow only a single delay on privileged instructions. + Doing otherwise would cause an infinite loop. */ + { + if (sched_verbose >= 2) + { + if (!autopref_multipass_dfa_lookahead_guard_started_dump_p) + { + fprintf (sched_dump, + ";;\t\tnot trying in max_issue due to autoprefetch " + "model: "); + autopref_multipass_dfa_lookahead_guard_started_dump_p = true; + } + + fprintf (sched_dump, " *%d*", INSN_UID (insn1)); + } + continue; + } + + for (int i2 = 0; i2 < ready.n_ready; ++i2) + { + rtx_insn *insn2 = get_ready_element (i2); + if (insn1 == insn2) + continue; + r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, write); + if (r) + { + if (ready_index == 0) + { + r = -1; + data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY; + } + goto finish; + } + } + + if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) == 1) + continue; + + /* Everything from the current queue slot should have been moved to + the ready list. 
*/ + gcc_assert (insn_queue[NEXT_Q_AFTER (q_ptr, 0)] == NULL_RTX); + + int n_stalls = PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) - 1; + if (n_stalls > max_insn_queue_index) + n_stalls = max_insn_queue_index; + + for (int stalls = 1; stalls <= n_stalls; ++stalls) + { + for (rtx_insn_list *link = insn_queue[NEXT_Q_AFTER (q_ptr, stalls)]; + link != NULL_RTX; + link = link->next ()) + { + rtx_insn *insn2 = link->insn (); + r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, + write); + if (r) + { + /* Queue INSN1 until INSN2 can issue. */ + r = -stalls; + if (ready_index == 0) + data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY; + goto finish; + } + } + } + } + + finish: + if (sched_verbose >= 2 + && autopref_multipass_dfa_lookahead_guard_started_dump_p + && (ready_index == ready.n_ready - 1 || r < 0)) + /* This does not /always/ trigger. We don't output EOL if the last + insn is not recognized (INSN_CODE < 0) and lookahead_guard is not + called. We can live with this. */ + fprintf (sched_dump, "\n"); + + return r; +} + /* Define type for target data used in multipass scheduling. */ #ifndef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T # define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T int @@ -8710,6 +8959,10 @@ init_h_i_d (rtx_insn *insn) INSN_EXACT_TICK (insn) = INVALID_TICK; INTER_TICK (insn) = INVALID_TICK; TODO_SPEC (insn) = HARD_DEP; + INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; + INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; } } diff --git a/gcc/params.def b/gcc/params.def index 3f69ce0c6a4..192c1e021c2 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -668,6 +668,11 @@ DEFPARAM (PARAM_SCHED_MEM_TRUE_DEP_COST, "Minimal distance between possibly conflicting store and load", 1, 0, 0) +DEFPARAM (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH, + "sched-autopref-queue-depth", + "Hardware autoprefetcher scheduler model control flag. 
Number of lookahead cycles the model looks into; at '0' only enable instruction sorting heuristic. Disabled by default.", + -1, 0, 0) + DEFPARAM(PARAM_MAX_LAST_VALUE_RTL, "max-last-value-rtl", "The maximum number of RTL nodes that can be recorded as combiner's last value", diff --git a/gcc/sched-int.h b/gcc/sched-int.h index 9392d04d5bf..28e95ea97b6 100644 --- a/gcc/sched-int.h +++ b/gcc/sched-int.h @@ -793,6 +793,32 @@ struct reg_set_data struct reg_set_data *next_insn_set; }; +enum autopref_multipass_data_status { + /* Entry is irrelevant for auto-prefetcher. */ + AUTOPREF_MULTIPASS_DATA_IRRELEVANT = -2, + /* Entry is uninitialized. */ + AUTOPREF_MULTIPASS_DATA_UNINITIALIZED = -1, + /* Entry is relevant for auto-prefetcher and insn can be delayed + to allow another insn through. */ + AUTOPREF_MULTIPASS_DATA_NORMAL = 0, + /* Entry is relevant for auto-prefetcher, but insn should not be + delayed as that will break scheduling. */ + AUTOPREF_MULTIPASS_DATA_DONT_DELAY = 1 +}; + +/* Data for modeling cache auto-prefetcher. */ +struct autopref_multipass_data_ +{ + /* Base part of memory address. */ + rtx base; + /* Memory offset. */ + int offset; + /* Entry status. */ + enum autopref_multipass_data_status status; +}; +typedef struct autopref_multipass_data_ autopref_multipass_data_def; +typedef autopref_multipass_data_def *autopref_multipass_data_t; + struct _haifa_insn_data { /* We can't place 'struct _deps_list' into h_i_d instead of deps_list_t @@ -893,6 +919,10 @@ struct _haifa_insn_data /* The deciding reason for INSN's place in the ready list. */ int last_rfs_win; + + /* Two entries for cache auto-prefetcher model: one for mem reads, + and one for mem writes. 
*/ + autopref_multipass_data_def autopref_multipass_data[2]; }; typedef struct _haifa_insn_data haifa_insn_data_def; @@ -915,6 +945,8 @@ extern vec h_i_d; (HID (INSN)->reg_pressure_excess_cost_change) #define INSN_PRIORITY_STATUS(INSN) (HID (INSN)->priority_status) #define INSN_MODEL_INDEX(INSN) (HID (INSN)->model_index) +#define INSN_AUTOPREF_MULTIPASS_DATA(INSN) \ + (HID (INSN)->autopref_multipass_data) typedef struct _haifa_deps_insn_data haifa_deps_insn_data_def; typedef haifa_deps_insn_data_def *haifa_deps_insn_data_t; @@ -1363,6 +1395,8 @@ extern int cycle_issued_insns; extern int issue_rate; extern int dfa_lookahead; +extern int autopref_multipass_dfa_lookahead_guard (rtx_insn *, int); + extern void ready_sort (struct ready_list *); extern rtx_insn *ready_element (struct ready_list *, int); extern rtx_insn **ready_lastpos (struct ready_list *);