|
|
|
@@ -159,7 +159,6 @@ static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int histor
|
|
|
|
|
static void free_partial_schedule (partial_schedule_ptr);
|
|
|
|
|
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
|
|
|
|
|
void print_partial_schedule (partial_schedule_ptr, FILE *);
|
|
|
|
|
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
|
|
|
|
|
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
|
|
|
|
|
ddg_node_ptr node, int cycle,
|
|
|
|
|
sbitmap must_precede,
|
|
|
|
@@ -365,7 +364,7 @@ set_node_sched_params (ddg_ptr g)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
print_node_sched_params (FILE * file, int num_nodes)
|
|
|
|
|
print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
@@ -377,7 +376,8 @@ print_node_sched_params (FILE * file, int num_nodes)
|
|
|
|
|
rtx reg_move = nsp->first_reg_move;
|
|
|
|
|
int j;
|
|
|
|
|
|
|
|
|
|
fprintf (file, "Node %d:\n", i);
|
|
|
|
|
fprintf (file, "Node = %d; INSN = %d\n", i,
|
|
|
|
|
(INSN_UID (g->nodes[i].insn)));
|
|
|
|
|
fprintf (file, " asap = %d:\n", nsp->asap);
|
|
|
|
|
fprintf (file, " time = %d:\n", nsp->time);
|
|
|
|
|
fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
|
|
|
|
@@ -390,29 +390,6 @@ print_node_sched_params (FILE * file, int num_nodes)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Calculate an upper bound for II. SMS should not schedule the loop if it
|
|
|
|
|
requires more cycles than this bound. Currently set to the sum of the
|
|
|
|
|
longest latency edge for each node. Reset based on experiments. */
|
|
|
|
|
static int
|
|
|
|
|
calculate_maxii (ddg_ptr g)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
int maxii = 0;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < g->num_nodes; i++)
|
|
|
|
|
{
|
|
|
|
|
ddg_node_ptr u = &g->nodes[i];
|
|
|
|
|
ddg_edge_ptr e;
|
|
|
|
|
int max_edge_latency = 0;
|
|
|
|
|
|
|
|
|
|
for (e = u->out; e; e = e->next_out)
|
|
|
|
|
max_edge_latency = MAX (max_edge_latency, e->latency);
|
|
|
|
|
|
|
|
|
|
maxii += max_edge_latency;
|
|
|
|
|
}
|
|
|
|
|
return maxii;
|
|
|
|
|
}
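
A rough standalone sketch of the bound computed above (not GCC code: the toy_* types are made-up, simplified stand-ins for the real ddg_node_ptr/ddg_edge_ptr structures):

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

struct toy_edge { int latency; struct toy_edge *next_out; };
struct toy_node { struct toy_edge *out; };

/* Sum, over all nodes, the largest latency among each node's out-edges;
   this mirrors the loop in calculate_maxii above.  */
static int
toy_calculate_maxii (struct toy_node *nodes, int num_nodes)
{
  int maxii = 0;
  for (int i = 0; i < num_nodes; i++)
    {
      int max_edge_latency = 0;
      for (struct toy_edge *e = nodes[i].out; e; e = e->next_out)
        max_edge_latency = MAX (max_edge_latency, e->latency);
      maxii += max_edge_latency;
    }
  return maxii;
}

int
main (void)
{
  /* Two nodes: the first has out-edges of latency 2 and 5, the second has
     a single out-edge of latency 1, so the bound is 5 + 1 = 6.  */
  struct toy_edge e2 = { 5, NULL }, e1 = { 2, &e2 }, e3 = { 1, NULL };
  struct toy_node nodes[2] = { { &e1 }, { &e3 } };
  printf ("maxii = %d\n", toy_calculate_maxii (nodes, 2));
  return 0;
}
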
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
Breaking intra-loop register anti-dependences:
|
|
|
|
|
Each intra-loop register anti-dependence implies a cross-iteration true
|
|
|
|
@@ -533,40 +510,6 @@ generate_reg_moves (partial_schedule_ptr ps, bool rescan)
|
|
|
|
|
return reg_move_replaces;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* We call this when we want to undo the SMS schedule for a given loop.
|
|
|
|
|
One of the things that we do is to delete the register moves generated
|
|
|
|
|
for the sake of SMS; this function deletes the register move instructions
|
|
|
|
|
recorded in the undo buffer. */
|
|
|
|
|
static void
|
|
|
|
|
undo_generate_reg_moves (partial_schedule_ptr ps,
|
|
|
|
|
struct undo_replace_buff_elem *reg_move_replaces)
|
|
|
|
|
{
|
|
|
|
|
int i,j;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < ps->g->num_nodes; i++)
|
|
|
|
|
{
|
|
|
|
|
ddg_node_ptr u = &ps->g->nodes[i];
|
|
|
|
|
rtx prev;
|
|
|
|
|
rtx crr = SCHED_FIRST_REG_MOVE (u);
|
|
|
|
|
|
|
|
|
|
for (j = 0; j < SCHED_NREG_MOVES (u); j++)
|
|
|
|
|
{
|
|
|
|
|
prev = PREV_INSN (crr);
|
|
|
|
|
delete_insn (crr);
|
|
|
|
|
crr = prev;
|
|
|
|
|
}
|
|
|
|
|
SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
while (reg_move_replaces)
|
|
|
|
|
{
|
|
|
|
|
struct undo_replace_buff_elem *rep = reg_move_replaces;
|
|
|
|
|
|
|
|
|
|
reg_move_replaces = reg_move_replaces->next;
|
|
|
|
|
replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
|
|
|
|
|
}
|
|
|
|
|
}
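
The undo buffer walked above is a singly linked list of (insn, orig_reg, new_reg) records. A minimal standalone model of the replay loop (illustrative only: plain integers stand in for rtx registers and insns, and the action is merely printed instead of calling replace_rtx):

#include <stdio.h>

/* Stand-in for struct undo_replace_buff_elem: one record per register
   replacement performed when the moves were generated.  */
struct toy_undo_elem
{
  int insn_uid;               /* which insn was rewritten */
  int orig_reg, new_reg;      /* what to put back, what to take out */
  struct toy_undo_elem *next;
};

/* Mirrors the while-loop in undo_generate_reg_moves: walk the list and
   undo each recorded replacement.  */
static void
toy_undo_replacements (struct toy_undo_elem *reg_move_replaces)
{
  while (reg_move_replaces)
    {
      struct toy_undo_elem *rep = reg_move_replaces;
      reg_move_replaces = reg_move_replaces->next;
      printf ("insn %d: restore r%d (was rewritten to r%d)\n",
              rep->insn_uid, rep->orig_reg, rep->new_reg);
    }
}

int
main (void)
{
  struct toy_undo_elem second = { 12, 3, 7, NULL };
  struct toy_undo_elem first = { 10, 3, 7, &second };
  toy_undo_replacements (&first);
  return 0;
}
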
|
|
|
|
|
|
|
|
|
|
/* Free memory allocated for the undo buffer. */
|
|
|
|
|
static void
|
|
|
|
|
free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
|
|
|
|
@@ -638,28 +581,6 @@ permute_partial_schedule (partial_schedule_ptr ps, rtx last)
|
|
|
|
|
PREV_INSN (last));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* As part of undoing SMS we return to the original ordering of the
|
|
|
|
|
instructions inside the loop kernel. Given the partial schedule PS, this
|
|
|
|
|
function returns the ordering of the instructions according to their CUID
|
|
|
|
|
in the DDG (PS->G), which is the original order of the instructions before
|
|
|
|
|
performing SMS. */
|
|
|
|
|
static void
|
|
|
|
|
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
for (i = 0 ; i < ps->g->num_nodes; i++)
|
|
|
|
|
if (last == ps->g->nodes[i].insn
|
|
|
|
|
|| last == ps->g->nodes[i].first_note)
|
|
|
|
|
break;
|
|
|
|
|
else if (PREV_INSN (last) != ps->g->nodes[i].insn)
|
|
|
|
|
reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
|
|
|
|
|
PREV_INSN (last));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Used to generate the prologue & epilogue. Duplicate the subset of
|
|
|
|
|
nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
|
|
|
|
|
of both), together with a prefix/suffix of their reg_moves. */
|
|
|
|
|
static void
|
|
|
|
|
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
|
|
|
|
|
int to_stage, int for_prolog)
|
|
|
|
@@ -869,6 +790,9 @@ canon_loop (struct loop *loop)
|
|
|
|
|
version may be entered. Just a guess. */
|
|
|
|
|
#define PROB_SMS_ENOUGH_ITERATIONS 80
|
|
|
|
|
|
|
|
|
|
/* Used to calculate the upper bound of ii. */
|
|
|
|
|
#define MAXII_FACTOR 2
|
|
|
|
|
|
|
|
|
|
/* Main entry point, perform SMS scheduling on the loops of the function
|
|
|
|
|
that consist of single basic blocks. */
|
|
|
|
|
static void
|
|
|
|
@@ -1097,7 +1021,7 @@ sms_schedule (void)
|
|
|
|
|
mii = 1; /* Need to pass some estimate of mii. */
|
|
|
|
|
rec_mii = sms_order_nodes (g, mii, node_order);
|
|
|
|
|
mii = MAX (res_MII (g), rec_mii);
|
|
|
|
|
maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
|
|
|
|
|
maxii = MAXII_FACTOR * mii;
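/* Illustrative example with made-up numbers (not from the patch): with
   res_MII (g) == 4 and rec_mii == 6, mii = MAX (4, 6) = 6 and, with
   MAXII_FACTOR == 2, maxii = 2 * 6 = 12, so scheduling is abandoned if
   no schedule with ii <= 12 is found.  */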
|
|
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
|
fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
|
|
|
|
@@ -1131,8 +1055,6 @@ sms_schedule (void)
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
|
|
|
|
|
int new_cycles;
|
|
|
|
|
struct undo_replace_buff_elem *reg_move_replaces;
|
|
|
|
|
|
|
|
|
|
if (dump_file)
|
|
|
|
@@ -1154,68 +1076,46 @@ sms_schedule (void)
|
|
|
|
|
normalize_sched_times (ps);
|
|
|
|
|
rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
|
|
|
|
|
set_columns_for_ps (ps);
|
|
|
|
|
|
|
|
|
|
canon_loop (loop);
|
|
|
|
|
|
|
|
|
|
/* Generate the kernel just to be able to measure its cycles. */
|
|
|
|
|
/* In case the BCT count is not known, do loop versioning.  */
|
|
|
|
|
if (count_reg && ! count_init)
|
|
|
|
|
{
|
|
|
|
|
rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
|
|
|
|
|
GEN_INT(stage_count));
|
|
|
|
|
unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
|
|
|
|
|
* REG_BR_PROB_BASE) / 100;
|
|
|
|
|
|
|
|
|
|
loop_version (loop, comp_rtx, &condition_bb,
|
|
|
|
|
prob, prob, REG_BR_PROB_BASE - prob,
|
|
|
|
|
true);
|
|
|
|
|
}
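/* Illustrative note on the guard built above: the run-time comparison
   count_reg > stage_count ensures the modulo-scheduled copy is only entered
   when there are more iterations than pipeline stages.  With
   PROB_SMS_ENOUGH_ITERATIONS == 80 that copy is predicted taken 80% of
   REG_BR_PROB_BASE, e.g. prob == 8000 when REG_BR_PROB_BASE == 10000.  */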
|
|
|
|
|
|
|
|
|
|
/* Set new iteration count of loop kernel. */
|
|
|
|
|
if (count_reg && count_init)
|
|
|
|
|
SET_SRC (single_set (count_init)) = GEN_INT (loop_count
|
|
|
|
|
- stage_count + 1);
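/* Worked example with illustrative numbers: if the loop originally runs
   loop_count == 100 times and the schedule uses stage_count == 3 stages,
   the kernel's new iteration count becomes 100 - 3 + 1 = 98; the remaining
   iterations are completed by the prologue and epilogue generated below.  */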
|
|
|
|
|
|
|
|
|
|
/* Now apply the scheduled kernel to the RTL of the loop. */
|
|
|
|
|
permute_partial_schedule (ps, g->closing_branch->first_note);
|
|
|
|
|
reg_move_replaces = generate_reg_moves (ps, false);
|
|
|
|
|
|
|
|
|
|
/* Get the number of cycles the new kernel expects to execute in. */
|
|
|
|
|
new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
|
|
|
|
|
/* Mark this loop as software pipelined so the later
|
|
|
|
|
scheduling passes don't touch it. */
|
|
|
|
|
if (! flag_resched_modulo_sched)
|
|
|
|
|
g->bb->flags |= BB_DISABLE_SCHEDULE;
|
|
|
|
|
/* The life-info is not valid any more. */
|
|
|
|
|
df_set_bb_dirty (g->bb);
|
|
|
|
|
|
|
|
|
|
/* Get back to the original loop so we can do loop versioning. */
|
|
|
|
|
undo_permute_partial_schedule (ps, g->closing_branch->first_note);
|
|
|
|
|
if (reg_move_replaces)
|
|
|
|
|
undo_generate_reg_moves (ps, reg_move_replaces);
|
|
|
|
|
|
|
|
|
|
if ( new_cycles >= orig_cycles)
|
|
|
|
|
{
|
|
|
|
|
/* SMS is not profitable so undo the permutation and reg move generation
|
|
|
|
|
and return the kernel to its original state. */
|
|
|
|
|
if (dump_file)
|
|
|
|
|
fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
reg_move_replaces = generate_reg_moves (ps, true);
|
|
|
|
|
if (dump_file)
|
|
|
|
|
print_node_sched_params (dump_file, g->num_nodes, g);
|
|
|
|
|
/* Generate prolog and epilog. */
|
|
|
|
|
if (count_reg && !count_init)
|
|
|
|
|
generate_prolog_epilog (ps, loop, count_reg);
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
canon_loop (loop);
|
|
|
|
|
|
|
|
|
|
/* In case the BCT count is not known, do loop versioning.  */
|
|
|
|
|
if (count_reg && ! count_init)
|
|
|
|
|
{
|
|
|
|
|
rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
|
|
|
|
|
GEN_INT(stage_count));
|
|
|
|
|
unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
|
|
|
|
|
* REG_BR_PROB_BASE) / 100;
|
|
|
|
|
|
|
|
|
|
loop_version (loop, comp_rtx, &condition_bb,
|
|
|
|
|
prob, prob, REG_BR_PROB_BASE - prob,
|
|
|
|
|
true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Set new iteration count of loop kernel. */
|
|
|
|
|
if (count_reg && count_init)
|
|
|
|
|
SET_SRC (single_set (count_init)) = GEN_INT (loop_count
|
|
|
|
|
- stage_count + 1);
|
|
|
|
|
|
|
|
|
|
/* Now apply the scheduled kernel to the RTL of the loop. */
|
|
|
|
|
permute_partial_schedule (ps, g->closing_branch->first_note);
|
|
|
|
|
|
|
|
|
|
/* Mark this loop as software pipelined so the later
|
|
|
|
|
scheduling passes don't touch it. */
|
|
|
|
|
if (! flag_resched_modulo_sched)
|
|
|
|
|
g->bb->flags |= BB_DISABLE_SCHEDULE;
|
|
|
|
|
/* The life-info is not valid any more. */
|
|
|
|
|
df_set_bb_dirty (g->bb);
|
|
|
|
|
|
|
|
|
|
reg_move_replaces = generate_reg_moves (ps, true);
|
|
|
|
|
if (dump_file)
|
|
|
|
|
print_node_sched_params (dump_file, g->num_nodes);
|
|
|
|
|
/* Generate prolog and epilog. */
|
|
|
|
|
if (count_reg && !count_init)
|
|
|
|
|
generate_prolog_epilog (ps, loop, count_reg);
|
|
|
|
|
else
|
|
|
|
|
generate_prolog_epilog (ps, loop, NULL_RTX);
|
|
|
|
|
}
|
|
|
|
|
generate_prolog_epilog (ps, loop, NULL_RTX);
|
|
|
|
|
|
|
|
|
|
free_undo_replace_buff (reg_move_replaces);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@@ -1529,17 +1429,21 @@ sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
|
|
|
|
|
of nodes within the cycle. */
|
|
|
|
|
sbitmap_zero (must_precede);
|
|
|
|
|
sbitmap_zero (must_follow);
|
|
|
|
|
for (e = u_node->in; e != 0; e = e->next_in)
|
|
|
|
|
/* TODO: We can add an insn to the must_precede or must_follow
|
|
|
|
|
bitmaps only if it has a tight dependence to U and they are
|
|
|
|
|
both scheduled in the same row. The current check is less
|
|
|
|
|
conservative and is content with the fact that both U and the
|
|
|
|
|
insn are scheduled in the same row. */
|
|
|
|
|
for (e = u_node->in; e != 0; e = e->next_in)
|
|
|
|
|
if (TEST_BIT (sched_nodes, e->src->cuid)
|
|
|
|
|
&& e->latency == (ii * e->distance)
|
|
|
|
|
&& start == SCHED_TIME (e->src))
|
|
|
|
|
SET_BIT (must_precede, e->src->cuid);
|
|
|
|
|
&& (SMODULO (SCHED_TIME (e->src), ii) == SMODULO (start, ii)))
|
|
|
|
|
SET_BIT (must_precede, e->src->cuid);
|
|
|
|
|
|
|
|
|
|
for (e = u_node->out; e != 0; e = e->next_out)
|
|
|
|
|
for (e = u_node->out; e != 0; e = e->next_out)
|
|
|
|
|
if (TEST_BIT (sched_nodes, e->dest->cuid)
|
|
|
|
|
&& e->latency == (ii * e->distance)
|
|
|
|
|
&& end == SCHED_TIME (e->dest))
|
|
|
|
|
SET_BIT (must_follow, e->dest->cuid);
|
|
|
|
|
&& (SMODULO (SCHED_TIME (e->dest), ii) ==
|
|
|
|
|
SMODULO ((end - step), ii)))
|
|
|
|
|
SET_BIT (must_follow, e->dest->cuid);
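/* Example of the row test used above, with illustrative numbers: with
   ii == 5, an already-scheduled predecessor at SCHED_TIME (e->src) == 7
   and a candidate slot start == 12 lie in the same row, since
   SMODULO (7, 5) == SMODULO (12, 5) == 2, so that predecessor is added to
   must_precede; a predecessor at time 8 (row 3) would not be.  */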
|
|
|
|
|
|
|
|
|
|
success = 0;
|
|
|
|
|
if ((step > 0 && start < end) || (step < 0 && start > end))
|
|
|
|
@@ -2259,57 +2163,7 @@ advance_one_cycle (void)
|
|
|
|
|
targetm.sched.dfa_post_cycle_insn ());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
|
|
|
|
|
the number of cycles according to DFA that the kernel fits in,
|
|
|
|
|
we use this to check if we have done well with SMS after we add
|
|
|
|
|
register moves. In some cases register moves overhead makes
|
|
|
|
|
it even worse than the original loop. We want SMS to be performed
|
|
|
|
|
when it gives fewer cycles after register moves are added. */
|
|
|
|
|
static int
|
|
|
|
|
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
|
|
|
|
|
{
|
|
|
|
|
int cycles = 0;
|
|
|
|
|
rtx insn;
|
|
|
|
|
int can_issue_more = issue_rate;
|
|
|
|
|
|
|
|
|
|
state_reset (curr_state);
|
|
|
|
|
|
|
|
|
|
for (insn = first_insn;
|
|
|
|
|
insn != NULL_RTX && insn != last_insn;
|
|
|
|
|
insn = NEXT_INSN (insn))
|
|
|
|
|
{
|
|
|
|
|
if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* Check if there is room for the current insn. */
|
|
|
|
|
if (!can_issue_more || state_dead_lock_p (curr_state))
|
|
|
|
|
{
|
|
|
|
|
cycles ++;
|
|
|
|
|
advance_one_cycle ();
|
|
|
|
|
can_issue_more = issue_rate;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Update the DFA state and return with failure if the DFA found
|
|
|
|
|
resource conflicts. */
|
|
|
|
|
if (state_transition (curr_state, insn) >= 0)
|
|
|
|
|
{
|
|
|
|
|
cycles ++;
|
|
|
|
|
advance_one_cycle ();
|
|
|
|
|
can_issue_more = issue_rate;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (targetm.sched.variable_issue)
|
|
|
|
|
can_issue_more =
|
|
|
|
|
targetm.sched.variable_issue (sched_dump, sched_verbose,
|
|
|
|
|
insn, can_issue_more);
|
|
|
|
|
/* A naked CLOBBER or USE generates no instruction, so don't
|
|
|
|
|
let them consume issue slots. */
|
|
|
|
|
else if (GET_CODE (PATTERN (insn)) != USE
|
|
|
|
|
&& GET_CODE (PATTERN (insn)) != CLOBBER)
|
|
|
|
|
can_issue_more--;
|
|
|
|
|
}
|
|
|
|
|
return cycles;
|
|
|
|
|
}
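
A rough standalone model of the counting loop above (not GCC code: there is no real DFA here, a fixed TOY_ISSUE_RATE and a dummy conflict test stand in for curr_state, state_transition and the target hooks), just to illustrate how cycles are accumulated:

#include <stdio.h>
#include <stdbool.h>

#define TOY_ISSUE_RATE 2   /* stand-in for the target's issue_rate */

/* Dummy resource model: pretend no instruction ever conflicts.  A real
   DFA would report a conflict via state_transition returning >= 0.  */
static bool
toy_conflict (int insn)
{
  (void) insn;
  return false;
}

/* Mirrors the structure of kernel_number_of_cycles: start a new cycle
   whenever the issue slots run out or the resource model reports a
   conflict, and use up one slot per issued insn.  */
static int
toy_kernel_cycles (int num_insns)
{
  int cycles = 0;
  int can_issue_more = TOY_ISSUE_RATE;

  for (int insn = 0; insn < num_insns; insn++)
    {
      if (!can_issue_more)
        {
          cycles++;
          can_issue_more = TOY_ISSUE_RATE;
        }
      if (toy_conflict (insn))
        {
          cycles++;
          can_issue_more = TOY_ISSUE_RATE;
        }
      can_issue_more--;
    }
  return cycles;
}

int
main (void)
{
  /* Five insns at two per cycle: the counter advances twice, matching the
     way the original function only counts completed cycles.  */
  printf ("cycles = %d\n", toy_kernel_cycles (5));
  return 0;
}
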
|
|
|
|
|
|
|
|
|
|
/* Checks if PS has resource conflicts according to DFA, starting from
|
|
|
|
|
FROM cycle to TO cycle; returns true if there are conflicts and false
|
|
|
|
|