haifa-sched.c (modulo_ii, [...]): New static variables.

* haifa-sched.c (modulo_ii, modulo_max_states, modulo_n_insns,
	modulo_insns_scheduled, modulo_iter0_max_uid, modulo_backtracks_left,
	modulo_last_stage): New static variables.
	(set_modulo_params, discard_delay_pairs_above): New functions.
	(struct delay_pair): New member stages.
	(htab_i2_traverse, htab_i1_traverse): New static functions.
	(record_delay_slot_pair): New arg stages.  All callers changed.
	Record it.
	(pair_delay): Take stages into account.
	(add_delay_dependencies): Don't do so for stage pairs.
	(struct sched_block_state): New member modulo_epilogue.
	(save_backtrack_point): Don't set SHADOW_P for stage pairs.
	(unschedule_insns_until): Decrease modulo_insns_scheduled.
	Set HARD_DEP without using or.
	(resolve_dependencies): New static function.
	(prune_ready_list): New arg modulo_epilogue_p.  All callers changed.
	If it is true, allow only insns with INSN_EXACT_TICK set.
	(schedule_block): Return bool, always true for normal scheduling,
	true or false depending on modulo scheduling success otherwise.
	Add bookkeeping for modulo scheduling, and call resolve_dependencies
	on everything left over after a modulo schedule.
	(haifa_sched_init): Remove check_cfg call.  Clear modulo_ii.
	* sched-int.h (schedule_block, record_delay_slot_pair): Adjust
	declarations.
	(set_modulo_params, discard_delay_pairs_above): Declare.
	* params.def (PARAM_MAX_MODULO_BACKTRACK_ATTEMPS): New.
	* doc/invoke.texi (--param): Document it.

From-SVN: r179383
This commit is contained in:
Bernd Schmidt 2011-09-30 13:33:54 +00:00 committed by Bernd Schmidt
parent ccb3dd5a14
commit 06d7e8e706
6 changed files with 356 additions and 32 deletions

View File

@ -1,3 +1,33 @@
2011-09-30 Bernd Schmidt <bernds@codesourcery.com>
* haifa-sched.c (modulo_ii, modulo_max_states, modulo_n_insns,
modulo_insns_scheduled, modulo_iter0_max_uid, modulo_backtracks_left,
modulo_last_stage): New static variables.
(set_modulo_params, discard_delay_pairs_above): New functions.
(struct delay_pair): New member stages.
(htab_i2_traverse, htab_i1_traverse): New static functions.
(record_delay_slot_pair): New arg stages. All callers changed.
Record it.
(pair_delay): Take stages into account.
(add_delay_dependencies): Don't do so for stage pairs.
(struct sched_block_state): New member modulo_epilogue.
(save_backtrack_point): Don't set SHADOW_P for stage pairs.
(unschedule_insns_until): Decrease modulo_insns_scheduled.
Set HARD_DEP without using or.
(resolve_dependencies): New static function.
(prune_ready_list): New arg modulo_epilogue_p. All callers changed.
If it is true, allow only insns with INSN_EXACT_TICK set.
(schedule_block): Return bool, always true for normal scheduling,
true or false depending on modulo scheduling success otherwise.
Add bookkeeping for modulo scheduling, and call resolve_dependencies
on everything left over after a modulo schedule.
(haifa_sched_init): Remove check_cfg call. Clear modulo_ii.
* sched-int.h (schedule_block, record_delay_slot_pair): Adjust
declarations.
(set_modulo_params, discard_delay_pairs_above): Declare.
* params.def (PARAM_MAX_MODULO_BACKTRACK_ATTEMPS): New.
* doc/invoke.texi (--param): Document it.
2011-09-30 Richard Guenther <rguenther@suse.de>
PR middle-end/50574

View File

@ -4810,7 +4810,7 @@ split_delayed_branch (rtx insn)
i1 = emit_insn_before (pat, insn);
PATTERN (insn) = newpat;
INSN_CODE (insn) = -1;
record_delay_slot_pair (i1, insn, 5);
record_delay_slot_pair (i1, insn, 5, 0);
}
/* Split every insn (i.e. jumps and calls) which can have delay slots into

View File

@ -8474,6 +8474,11 @@ before flushing the current state and starting over. Large functions
with few branches or calls can create excessively large lists which
needlessly consume memory and resources.
@item max-modulo-backtrack-attempts
The maximum number of backtrack attempts the scheduler should make
when modulo scheduling a loop. Larger values can exponentially increase
compile time.
@item max-inline-insns-single
Several parameters control the tree inliner used in gcc.
This number sets the maximum number of instructions (counted in GCC's

View File

@ -163,6 +163,31 @@ int issue_rate;
enable a DCE pass. */
bool sched_no_dce;
/* The current initiation interval used when modulo scheduling. */
static int modulo_ii;
/* The maximum number of stages we are prepared to handle. */
static int modulo_max_stages;
/* The number of insns that exist in each iteration of the loop. We use this
to detect when we've scheduled all insns from the first iteration. */
static int modulo_n_insns;
/* The current count of insns in the first iteration of the loop that have
already been scheduled. */
static int modulo_insns_scheduled;
/* The maximum uid of insns from the first iteration of the loop. */
static int modulo_iter0_max_uid;
/* The number of times we should attempt to backtrack when modulo scheduling.
Decreased each time we have to backtrack. */
static int modulo_backtracks_left;
/* The stage in which the last insn from the original loop was
scheduled. */
static int modulo_last_stage;
/* sched-verbose controls the amount of debugging output the
scheduler prints. It is controlled by -fsched-verbose=N:
N>0 and no -DSR : the output is directed to stderr.
@ -507,6 +532,29 @@ haifa_classify_insn (const_rtx insn)
{
return haifa_classify_rtx (PATTERN (insn));
}
/* After the scheduler initialization function has been called, this function
can be called to enable modulo scheduling. II is the initiation interval
we should use, it affects the delays for delay_pairs that were recorded as
separated by a given number of stages.
MAX_STAGES provides us with a limit
after which we give up scheduling; the caller must have unrolled at least
as many copies of the loop body and recorded delay_pairs for them.
INSNS is the number of real (non-debug) insns in one iteration of
the loop. MAX_UID can be used to test whether an insn belongs to
the first iteration of the loop; all of them have a uid lower than
MAX_UID. */
void
set_modulo_params (int ii, int max_stages, int insns, int max_uid)
{
modulo_ii = ii;
modulo_max_stages = max_stages;
modulo_n_insns = insns;
modulo_iter0_max_uid = max_uid;
modulo_backtracks_left = PARAM_VALUE (PARAM_MAX_MODULO_BACKTRACK_ATTEMPTS);
}
/* A structure to record a pair of insns where the first one is a real
insn that has delay slots, and the second is its delayed shadow.
@ -518,6 +566,10 @@ struct delay_pair
struct delay_pair *next_same_i1;
rtx i1, i2;
int cycles;
/* When doing modulo scheduling, we a delay_pair can also be used to
show that I1 and I2 are the same insn in a different stage. If that
is the case, STAGES will be nonzero. */
int stages;
};
/* Two hash tables to record delay_pairs, one indexed by I1 and the other
@ -525,6 +577,62 @@ struct delay_pair
static htab_t delay_htab;
static htab_t delay_htab_i2;
/* Called through htab_traverse. Walk the hashtable using I2 as
index, and delete all elements involving an UID higher than
that pointed to by *DATA. */
static int
htab_i2_traverse (void **slot, void *data)
{
int maxuid = *(int *)data;
struct delay_pair *p = *(struct delay_pair **)slot;
if (INSN_UID (p->i2) >= maxuid || INSN_UID (p->i1) >= maxuid)
{
htab_clear_slot (delay_htab_i2, slot);
}
return 1;
}
/* Called through htab_traverse. Walk the hashtable using I2 as
index, and delete all elements involving an UID higher than
that pointed to by *DATA. */
static int
htab_i1_traverse (void **slot, void *data)
{
int maxuid = *(int *)data;
struct delay_pair **pslot = (struct delay_pair **)slot;
struct delay_pair *p, *first, **pprev;
if (INSN_UID ((*pslot)->i1) >= maxuid)
{
htab_clear_slot (delay_htab, slot);
return 1;
}
pprev = &first;
for (p = *pslot; p; p = p->next_same_i1)
{
if (INSN_UID (p->i2) < maxuid)
{
*pprev = p;
pprev = &p->next_same_i1;
}
}
*pprev = NULL;
if (first == NULL)
htab_clear_slot (delay_htab, slot);
else
*pslot = first;
return 1;
}
/* Discard all delay pairs which involve an insn with an UID higher
than MAX_UID. */
void
discard_delay_pairs_above (int max_uid)
{
htab_traverse (delay_htab, htab_i1_traverse, &max_uid);
htab_traverse (delay_htab_i2, htab_i2_traverse, &max_uid);
}
/* Returns a hash value for X (which really is a delay_pair), based on
hashing just I1. */
static hashval_t
@ -555,18 +663,24 @@ delay_i2_eq (const void *x, const void *y)
return ((const struct delay_pair *) x)->i2 == y;
}
/* This function can be called by a port just before it starts the
final scheduling pass. It records the fact that an instruction
with delay slots has been split into two insns, I1 and I2. The
first one will be scheduled normally and initiates the operation.
The second one is a shadow which must follow a specific number of
CYCLES after I1; its only purpose is to show the side effect that
occurs at that cycle in the RTL. If a JUMP_INSN or a CALL_INSN has
been split, I1 should be a normal INSN, while I2 retains the
original insn type. */
/* This function can be called by a port just before it starts the final
scheduling pass. It records the fact that an instruction with delay
slots has been split into two insns, I1 and I2. The first one will be
scheduled normally and initiates the operation. The second one is a
shadow which must follow a specific number of cycles after I1; its only
purpose is to show the side effect that occurs at that cycle in the RTL.
If a JUMP_INSN or a CALL_INSN has been split, I1 should be a normal INSN,
while I2 retains the original insn type.
There are two ways in which the number of cycles can be specified,
involving the CYCLES and STAGES arguments to this function. If STAGES
is zero, we just use the value of CYCLES. Otherwise, STAGES is a factor
which is multiplied by MODULO_II to give the number of cycles. This is
only useful if the caller also calls set_modulo_params to enable modulo
scheduling. */
void
record_delay_slot_pair (rtx i1, rtx i2, int cycles)
record_delay_slot_pair (rtx i1, rtx i2, int cycles, int stages)
{
struct delay_pair *p = XNEW (struct delay_pair);
struct delay_pair **slot;
@ -574,6 +688,7 @@ record_delay_slot_pair (rtx i1, rtx i2, int cycles)
p->i1 = i1;
p->i2 = i2;
p->cycles = cycles;
p->stages = stages;
if (!delay_htab)
{
@ -596,7 +711,10 @@ record_delay_slot_pair (rtx i1, rtx i2, int cycles)
static int
pair_delay (struct delay_pair *p)
{
return p->cycles;
if (p->stages == 0)
return p->cycles;
else
return p->stages * modulo_ii;
}
/* Given an insn INSN, add a dependence on its delayed shadow if it
@ -619,6 +737,8 @@ add_delay_dependencies (rtx insn)
if (!pair)
return;
add_dependence (insn, pair->i1, REG_DEP_ANTI);
if (pair->stages)
return;
FOR_EACH_DEP (pair->i2, SD_LIST_BACK, sd_it, dep)
{
@ -626,7 +746,7 @@ add_delay_dependencies (rtx insn)
struct delay_pair *other_pair
= (struct delay_pair *)htab_find_with_hash (delay_htab_i2, pro,
htab_hash_pointer (pro));
if (!other_pair)
if (!other_pair || other_pair->stages)
continue;
if (pair_delay (other_pair) >= pair_delay (pair))
{
@ -1851,6 +1971,9 @@ struct sched_block_state
/* True if a shadow insn has been scheduled in the current cycle, which
means that no more normal insns can be issued. */
bool shadows_only_p;
/* True if we're winding down a modulo schedule, which means that we only
issue insns with INSN_EXACT_TICK set. */
bool modulo_epilogue;
/* Initialized with the machine's issue rate every cycle, and updated
by calls to the variable_issue hook. */
int can_issue_more;
@ -2223,7 +2346,7 @@ save_backtrack_point (struct delay_pair *pair,
mark_backtrack_feeds (pair->i2, 1);
INSN_TICK (pair->i2) = INVALID_TICK;
INSN_EXACT_TICK (pair->i2) = clock_var + pair_delay (pair);
SHADOW_P (pair->i2) = true;
SHADOW_P (pair->i2) = pair->stages == 0;
pair = pair->next_same_i1;
}
}
@ -2249,11 +2372,14 @@ unschedule_insns_until (rtx insn)
if (last != insn)
INSN_TICK (last) = INVALID_TICK;
if (modulo_ii > 0 && INSN_UID (last) < modulo_iter0_max_uid)
modulo_insns_scheduled--;
for (sd_it = sd_iterator_start (last, SD_LIST_RES_FORW);
sd_iterator_cond (&sd_it, &dep);)
{
rtx con = DEP_CON (dep);
TODO_SPEC (con) |= HARD_DEP;
TODO_SPEC (con) = HARD_DEP;
INSN_TICK (con) = INVALID_TICK;
sd_unresolve_dep (sd_it);
}
@ -2467,6 +2593,56 @@ estimate_shadow_tick (struct delay_pair *p)
return 0;
}
/* If INSN has no unresolved backwards dependencies, add it to the schedule and
recursively resolve all its forward dependencies. */
static void
resolve_dependencies (rtx insn)
{
sd_iterator_def sd_it;
dep_t dep;
/* Don't use sd_lists_empty_p; it ignores debug insns. */
if (DEPS_LIST_FIRST (INSN_HARD_BACK_DEPS (insn)) != NULL
|| DEPS_LIST_FIRST (INSN_SPEC_BACK_DEPS (insn)) != NULL)
return;
if (sched_verbose >= 4)
fprintf (sched_dump, ";;\tquickly resolving %d\n", INSN_UID (insn));
if (QUEUE_INDEX (insn) >= 0)
queue_remove (insn);
VEC_safe_push (rtx, heap, scheduled_insns, insn);
/* Update dependent instructions. */
for (sd_it = sd_iterator_start (insn, SD_LIST_FORW);
sd_iterator_cond (&sd_it, &dep);)
{
rtx next = DEP_CON (dep);
if (sched_verbose >= 4)
fprintf (sched_dump, ";;\t\tdep %d against %d\n", INSN_UID (insn),
INSN_UID (next));
/* Resolve the dependence between INSN and NEXT.
sd_resolve_dep () moves current dep to another list thus
advancing the iterator. */
sd_resolve_dep (sd_it);
if (!IS_SPECULATION_BRANCHY_CHECK_P (insn))
{
resolve_dependencies (next);
}
else
/* Check always has only one forward dependence (to the first insn in
the recovery block), therefore, this will be executed only once. */
{
gcc_assert (sd_lists_empty_p (insn, SD_LIST_FORW));
}
}
}
/* Return the head and tail pointers of ebb starting at BEG and ending
at END. */
void
@ -3448,15 +3624,12 @@ commit_schedule (rtx prev_head, rtx tail, basic_block *target_bb)
issue an asm statement.
If SHADOWS_ONLY_P is true, we eliminate all real insns and only
leave those for which SHADOW_P is true.
Return the number of cycles we must
advance to find the next ready instruction, or zero if there remain
insns on the ready list. */
leave those for which SHADOW_P is true. If MODULO_EPILOGUE is true,
we only leave insns which have an INSN_EXACT_TICK. */
static void
prune_ready_list (state_t temp_state, bool first_cycle_insn_p,
bool shadows_only_p)
bool shadows_only_p, bool modulo_epilogue_p)
{
int i;
@ -3467,6 +3640,12 @@ prune_ready_list (state_t temp_state, bool first_cycle_insn_p,
int cost = 0;
const char *reason = "resource conflict";
if (modulo_epilogue_p && !DEBUG_INSN_P (insn)
&& INSN_EXACT_TICK (insn) == INVALID_TICK)
{
cost = max_insn_queue_index;
reason = "not an epilogue insn";
}
if (shadows_only_p && !DEBUG_INSN_P (insn) && !SHADOW_P (insn))
{
cost = 1;
@ -3580,10 +3759,11 @@ verify_shadows (void)
TARGET_BB, possibly bringing insns from subsequent blocks in the same
region. */
void
bool
schedule_block (basic_block *target_bb)
{
int i;
bool success = modulo_ii == 0;
struct sched_block_state ls;
state_t temp_state = NULL; /* It is used for multipass scheduling. */
int sort_p, advance, start_clock_var;
@ -3704,6 +3884,9 @@ schedule_block (basic_block *target_bb)
gcc_assert (VEC_length (rtx, scheduled_insns) == 0);
sort_p = TRUE;
must_backtrack = false;
modulo_insns_scheduled = 0;
ls.modulo_epilogue = false;
/* Loop until all the insns in BB are scheduled. */
while ((*current_sched_info->schedule_more_p) ())
@ -3733,8 +3916,41 @@ schedule_block (basic_block *target_bb)
}
while (advance > 0);
if (ready.n_ready > 0)
prune_ready_list (temp_state, true, false);
if (ls.modulo_epilogue)
{
int stage = clock_var / modulo_ii;
if (stage > modulo_last_stage * 2 + 2)
{
if (sched_verbose >= 2)
fprintf (sched_dump,
";;\t\tmodulo scheduled succeeded at II %d\n",
modulo_ii);
success = true;
goto end_schedule;
}
}
else if (modulo_ii > 0)
{
int stage = clock_var / modulo_ii;
if (stage > modulo_max_stages)
{
if (sched_verbose >= 2)
fprintf (sched_dump,
";;\t\tfailing schedule due to excessive stages\n");
goto end_schedule;
}
if (modulo_n_insns == modulo_insns_scheduled
&& stage > modulo_last_stage)
{
if (sched_verbose >= 2)
fprintf (sched_dump,
";;\t\tfound kernel after %d stages, II %d\n",
stage, modulo_ii);
ls.modulo_epilogue = true;
}
}
prune_ready_list (temp_state, true, false, ls.modulo_epilogue);
if (ready.n_ready == 0)
continue;
if (must_backtrack)
@ -3912,6 +4128,11 @@ schedule_block (basic_block *target_bb)
/* DECISION is made. */
if (modulo_ii > 0 && INSN_UID (insn) < modulo_iter0_max_uid)
{
modulo_insns_scheduled++;
modulo_last_stage = clock_var / modulo_ii;
}
if (TODO_SPEC (insn) & SPECULATIVE)
generate_recovery_code (insn);
@ -3964,7 +4185,8 @@ schedule_block (basic_block *target_bb)
ls.first_cycle_insn_p = false;
if (ready.n_ready > 0)
prune_ready_list (temp_state, false, ls.shadows_only_p);
prune_ready_list (temp_state, false, ls.shadows_only_p,
ls.modulo_epilogue);
}
do_backtrack:
@ -3979,6 +4201,12 @@ schedule_block (basic_block *target_bb)
break;
}
}
if (must_backtrack && modulo_ii > 0)
{
if (modulo_backtracks_left == 0)
goto end_schedule;
modulo_backtracks_left--;
}
while (must_backtrack)
{
struct haifa_saved_data *failed;
@ -4012,7 +4240,48 @@ schedule_block (basic_block *target_bb)
}
}
}
if (ls.modulo_epilogue)
success = true;
end_schedule:
if (modulo_ii > 0)
{
/* Once again, debug insn suckiness: they can be on the ready list
even if they have unresolved dependencies. To make our view
of the world consistent, remove such "ready" insns. */
restart_debug_insn_loop:
for (i = ready.n_ready - 1; i >= 0; i--)
{
rtx x;
x = ready_element (&ready, i);
if (DEPS_LIST_FIRST (INSN_HARD_BACK_DEPS (x)) != NULL
|| DEPS_LIST_FIRST (INSN_SPEC_BACK_DEPS (x)) != NULL)
{
ready_remove (&ready, i);
goto restart_debug_insn_loop;
}
}
for (i = ready.n_ready - 1; i >= 0; i--)
{
rtx x;
x = ready_element (&ready, i);
resolve_dependencies (x);
}
for (i = 0; i <= max_insn_queue_index; i++)
{
rtx link;
while ((link = insn_queue[i]) != NULL)
{
rtx x = XEXP (link, 0);
insn_queue[i] = XEXP (link, 1);
QUEUE_INDEX (x) = QUEUE_NOWHERE;
free_INSN_LIST_node (link);
resolve_dependencies (x);
}
}
}
/* Debug info. */
if (sched_verbose)
{
@ -4020,11 +4289,11 @@ schedule_block (basic_block *target_bb)
debug_ready_list (&ready);
}
if (current_sched_info->queue_must_finish_empty)
if (modulo_ii == 0 && current_sched_info->queue_must_finish_empty)
/* Sanity check -- queue must be empty now. Meaningless if region has
multiple bbs. */
gcc_assert (!q_size && !ready.n_ready && !ready.n_debug);
else
else if (modulo_ii == 0)
{
/* We must maintain QUEUE_INDEX between blocks in region. */
for (i = ready.n_ready - 1; i >= 0; i--)
@ -4052,9 +4321,16 @@ schedule_block (basic_block *target_bb)
}
}
commit_schedule (prev_head, tail, target_bb);
if (sched_verbose)
fprintf (sched_dump, ";; total time = %d\n", clock_var);
if (success)
{
commit_schedule (prev_head, tail, target_bb);
if (sched_verbose)
fprintf (sched_dump, ";; total time = %d\n", clock_var);
}
else
last_scheduled_insn = tail;
VEC_truncate (rtx, scheduled_insns, 0);
if (!current_sched_info->queue_must_finish_empty
|| haifa_recovery_bb_recently_added_p)
@ -4092,6 +4368,8 @@ schedule_block (basic_block *target_bb)
current_sched_info->tail = tail;
free_backtrack_queue ();
return success;
}
/* Set_priorities: compute priority of each insn in the block. */
@ -4302,6 +4580,8 @@ haifa_sched_init (void)
nr_begin_data = nr_begin_control = nr_be_in_data = nr_be_in_control = 0;
before_recovery = 0;
after_recovery = 0;
modulo_ii = 0;
}
/* Finish work with the data specific to the Haifa scheduler. */

View File

@ -165,6 +165,13 @@ DEFPARAM(PARAM_MAX_PENDING_LIST_LENGTH,
"The maximum length of scheduling's pending operations list",
32, 0, 0)
/* This parameter limits the number of backtracking attempts when using the
haifa scheduler for modulo scheduling. */
DEFPARAM(PARAM_MAX_MODULO_BACKTRACK_ATTEMPTS,
"max-modulo-backtrack-attempts",
"The maximum number of backtrack attempts the scheduler should make when modulo scheduling a loop",
40, 0, 0)
DEFPARAM(PARAM_LARGE_FUNCTION_INSNS,
"large-function-insns",
"The size of function body to be considered large",

View File

@ -1257,7 +1257,7 @@ extern int dep_cost (dep_t);
extern int set_priorities (rtx, rtx);
extern void sched_setup_bb_reg_pressure_info (basic_block, rtx);
extern void schedule_block (basic_block *);
extern bool schedule_block (basic_block *);
extern int cycle_issued_insns;
extern int issue_rate;
@ -1330,7 +1330,9 @@ extern int current_blocks;
extern int target_bb;
extern bool sched_no_dce;
extern void record_delay_slot_pair (rtx, rtx, int);
extern void set_modulo_params (int, int, int, int);
extern void record_delay_slot_pair (rtx, rtx, int, int);
extern void discard_delay_pairs_above (int);
extern void free_delay_pairs (void);
extern void add_delay_dependencies (rtx);
extern bool sched_is_disabled_for_current_region_p (void);