Modulo-scheduling improvements. Patch 2 of 2

Co-Authored-By: Andrey Belevantsev <abel@ispras.ru>
Co-Authored-By: Revital Eres <eres@il.ibm.com>
Co-Authored-By: Vladimir Yanovsky <yanov@il.ibm.com>

From-SVN: r127848
This commit is contained in:
Mircea Namolaru 2007-08-28 06:52:16 +00:00 committed by Revital Eres
parent 7368348cb7
commit 46dc0789fe
7 changed files with 293 additions and 62 deletions

View File

@ -1,3 +1,31 @@
2007-08-28 Mircea Namolaru <namolaru@il.ibm.com>
Vladimir Yanovsky <yanov@il.ibm.com>
Revital Eres <eres@il.ibm.com>
Andrey Belevantsev <abel@ispras.ru>
* config/spu/spu.md: Recognize doloop pattern when -fmodulo-sched
is set.
* modulo-sched.c: Add documentation regarding do-loop.
(doloop_register_get): Change number of arguments to support
the new do-loop pattern and check whether COUNT_REG has no other
occurences in the loop besides in the control part.
(duplicate_insns_of_cycles): Do not duplicate the insn which
changes count_reg as it is already adjusted.
(generate_prolog_epilog): New argument to support the new
do-loop pattern. Change the subtract instruction to use
expand_simple_binop. Call duplicate_insns_of_cycles with new
argument.
(sms_schedule): Call doloop_register_get and
generate_prolog_epilog with new argument. Do not handle loops
with single sets insns with subreg in their lhs.
* loop-doloop.c (doloop_optimize): Support for another do-loop
pattern.
(doloop_condition_get): Gets an instruction instead of a pattern
and change the return condition when the do-loop pattern is
not parallel.
* ddg.c (create_ddg_dep_from_intra_loop_link): Handle only reg
deps when considering to not create edges.
2007-08-27 Alexandre Oliva <aoliva@redhat.com>
* doc/extend.texi (gnu_inline funtion attribute): Document C++

View File

@ -3887,6 +3887,48 @@ selb\t%0,%4,%0,%3"
[(set_attr "type" "br")])
;; Define the subtract-one-and-jump insns so loop.c
;; knows what to generate.
(define_expand "doloop_end"
[(use (match_operand 0 "" "")) ; loop pseudo
(use (match_operand 1 "" "")) ; iterations; zero if unknown
(use (match_operand 2 "" "")) ; max iterations
(use (match_operand 3 "" "")) ; loop level
(use (match_operand 4 "" ""))] ; label
""
"
{
/* Currently SMS relies on the do-loop pattern to recognize loops
where (1) the control part comprises of all insns defining and/or
using a certain 'count' register and (2) the loop count can be
adjusted by modifying this register prior to the loop.
. ??? The possible introduction of a new block to initialize the
new IV can potentially effects branch optimizations. */
if (optimize > 0 && flag_modulo_sched)
{
rtx s0;
rtx bcomp;
rtx loc_ref;
/* Only use this on innermost loops. */
if (INTVAL (operands[3]) > 1)
FAIL;
if (GET_MODE (operands[0]) != SImode)
FAIL;
s0 = operands [0];
emit_move_insn (s0, gen_rtx_PLUS (SImode, s0, GEN_INT (-1)));
bcomp = gen_rtx_NE(SImode, s0, const0_rtx);
loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [4]);
emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
loc_ref, pc_rtx)));
DONE;
}
}")
;; convert between any two modes, avoiding any GCC assumptions
(define_expand "spu_convert"
[(set (match_operand 0 "spu_reg_operand" "")

View File

@ -176,13 +176,17 @@ create_ddg_dep_from_intra_loop_link (ddg_ptr g, ddg_node_ptr src_node,
rtx set;
set = single_set (dest_node->insn);
if (set)
/* TODO: Handle registers that REG_P is not true for them, i.e.
subregs and special registers. */
if (set && REG_P (SET_DEST (set)))
{
int regno = REGNO (SET_DEST (set));
struct df_ref *first_def =
df_bb_regno_first_def_find (g->bb, regno);
struct df_ref *first_def;
struct df_rd_bb_info *bb_info = DF_RD_BB_INFO (g->bb);
first_def = df_bb_regno_first_def_find (g->bb, regno);
gcc_assert (first_def);
if (bitmap_bit_p (bb_info->gen, first_def->id))
return;
}

View File

@ -69,35 +69,59 @@ along with GCC; see the file COPYING3. If not see
if it is not a decrement and branch jump insn. */
rtx
doloop_condition_get (rtx pattern)
doloop_condition_get (rtx doloop_pat)
{
rtx cmp;
rtx inc;
rtx reg;
rtx inc_src;
rtx condition;
rtx pattern;
/* The canonical doloop pattern we expect is:
/* The canonical doloop pattern we expect has one of the following
forms:
(parallel [(set (pc) (if_then_else (condition)
(label_ref (label))
(pc)))
(set (reg) (plus (reg) (const_int -1)))
(additional clobbers and uses)])
1) (parallel [(set (pc) (if_then_else (condition)
(label_ref (label))
(pc)))
(set (reg) (plus (reg) (const_int -1)))
(additional clobbers and uses)])
Some targets (IA-64) wrap the set of the loop counter in
an if_then_else too.
The branch must be the first entry of the parallel (also required
by jump.c), and the second entry of the parallel must be a set of
the loop counter register. Some targets (IA-64) wrap the set of
the loop counter in an if_then_else too.
In summary, the branch must be the first entry of the
parallel (also required by jump.c), and the second
entry of the parallel must be a set of the loop counter
register. */
2) (set (reg) (plus (reg) (const_int -1))
(set (pc) (if_then_else (reg != 0)
(label_ref (label))
(pc))). */
pattern = PATTERN (doloop_pat);
if (GET_CODE (pattern) != PARALLEL)
return 0;
{
rtx cond;
cmp = XVECEXP (pattern, 0, 0);
inc = XVECEXP (pattern, 0, 1);
/* We expect the decrement to immediately precede the branch. */
if ((PREV_INSN (doloop_pat) == NULL_RTX)
|| !INSN_P (PREV_INSN (doloop_pat)))
return 0;
cmp = pattern;
inc = PATTERN (PREV_INSN (doloop_pat));
/* We expect the condition to be of the form (reg != 0) */
cond = XEXP (SET_SRC (cmp), 0);
if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
return 0;
}
else
{
cmp = XVECEXP (pattern, 0, 0);
inc = XVECEXP (pattern, 0, 1);
}
/* Check for (set (reg) (something)). */
if (GET_CODE (inc) != SET)
@ -139,7 +163,29 @@ doloop_condition_get (rtx pattern)
if ((XEXP (condition, 0) == reg)
|| (GET_CODE (XEXP (condition, 0)) == PLUS
&& XEXP (XEXP (condition, 0), 0) == reg))
{
if (GET_CODE (pattern) != PARALLEL)
/* The second form we expect:
(set (reg) (plus (reg) (const_int -1))
(set (pc) (if_then_else (reg != 0)
(label_ref (label))
(pc))).
is equivalent to the following:
(parallel [(set (pc) (if_then_else (reg != 1)
(label_ref (label))
(pc)))
(set (reg) (plus (reg) (const_int -1)))
(additional clobbers and uses)])
So we return that form instead.
*/
condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
return condition;
}
/* ??? If a machine uses a funny comparison, we could return a
canonicalized form here. */
@ -597,9 +643,7 @@ doloop_optimize (struct loop *loop)
{
while (NEXT_INSN (doloop_pat) != NULL_RTX)
doloop_pat = NEXT_INSN (doloop_pat);
if (JUMP_P (doloop_pat))
doloop_pat = PATTERN (doloop_pat);
else
if (!JUMP_P (doloop_pat))
doloop_pat = NULL_RTX;
}

View File

@ -82,8 +82,21 @@ along with GCC; see the file COPYING3. If not see
perform modulo variable expansion for live ranges that span more than
II cycles (i.e. use register copies to prevent a def from overwriting
itself before reaching the use).
*/
SMS works with countable loops (1) whose control part can be easily
decoupled from the rest of the loop and (2) whose loop count can
be easily adjusted. This is because we peel a constant number of
iterations into a prologue and epilogue for which we want to avoid
emitting the control part, and a kernel which is to iterate that
constant number of iterations less than the original loop. So the
control part should be a set of insns clearly identified and having
its own iv, not otherwise used in the loop (at-least for now), which
initializes a register before the loop to the number of iterations.
Currently SMS relies on the do-loop pattern to recognize such loops,
where (1) the control part comprises of all insns defining and/or
using a certain 'count' register and (2) the loop count can be
adjusted by modifying this register prior to the loop.
TODO: Rely on cfgloop analysis instead. */
/* This page defines partial-schedule structures and functions for
modulo scheduling. */
@ -182,10 +195,11 @@ static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
static void generate_prolog_epilog (partial_schedule_ptr, struct loop *loop,
rtx, rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
int from_stage, int to_stage,
int is_prolog);
int is_prolog, rtx count_reg);
#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
@ -261,20 +275,22 @@ static struct sched_info sms_sched_info =
};
/* Return the register decremented and tested in INSN,
or zero if it is not a decrement-and-branch insn. */
/* Given HEAD and TAIL which are the first and last insns in a loop;
return the register which controls the loop. Return zero if it has
more than one occurrence in the loop besides the control part or the
do-loop pattern is not of the form we expect. */
static rtx
doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
{
#ifdef HAVE_doloop_end
rtx pattern, reg, condition;
rtx reg, condition, insn;
bool found = false;
if (! JUMP_P (insn))
if (!JUMP_P (tail))
return NULL_RTX;
pattern = PATTERN (insn);
condition = doloop_condition_get (pattern);
/* TODO: Free SMS's dependence on doloop_condition_get. */
condition = doloop_condition_get (tail);
if (! condition)
return NULL_RTX;
@ -286,6 +302,29 @@ doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
else
gcc_unreachable ();
/* Check that the COUNT_REG has no other occurrences in the loop
until the decrement. We assume the control part consists of
either a single (parallel) branch-on-count or a (non-parallel)
branch immediately preceded by a single (decrement) insn. */
for (insn = head; insn != PREV_INSN (tail); insn = NEXT_INSN (insn))
if ((found = reg_mentioned_p (reg, insn)) == true)
break;
if (found)
{
if (dump_file)
fprintf (dump_file, "SMS count_reg found outside control\n");
return NULL_RTX;
}
/* One last check in case the do-loop pattern is parallel. */
if (GET_CODE (PATTERN (tail)) == PARALLEL)
if (reg_mentioned_p (reg, PREV_INSN (tail)))
{
if (dump_file)
fprintf (dump_file, "SMS count_reg found outside control\n");
return NULL_RTX;
}
return reg;
#else
return NULL_RTX;
@ -583,7 +622,7 @@ permute_partial_schedule (partial_schedule_ptr ps, rtx last)
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
int to_stage, int for_prolog)
int to_stage, int for_prolog, rtx count_reg)
{
int row;
ps_insn_ptr ps_ij;
@ -595,6 +634,13 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
int j, i_reg_moves;
rtx reg_move = NULL_RTX;
/* Do not duplicate any insn which refers to count_reg as it
belongs to the control part.
TODO: This should be done by analyzing the control part of
the loop. */
if (reg_mentioned_p (count_reg, u_node->insn))
continue;
if (for_prolog)
{
/* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
@ -643,23 +689,35 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
/* Generate the instructions (including reg_moves) for prolog & epilog. */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
rtx count_reg, rtx count_init)
{
int i;
int last_stage = PS_STAGE_COUNT (ps) - 1;
edge e;
/* Generate the prolog, inserting its insns on the loop-entry edge. */
start_sequence ();
if (count_reg)
/* Generate a subtract instruction at the beginning of the prolog to
adjust the loop count by STAGE_COUNT. */
emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
if (!count_init)
{
/* Generate instructions at the beginning of the prolog to
adjust the loop count by STAGE_COUNT. If loop count is constant
(count_init), this constant is adjusted by STAGE_COUNT in
generate_prolog_epilog function. */
rtx sub_reg = NULL_RTX;
sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
count_reg, GEN_INT (last_stage),
count_reg, 1, OPTAB_DIRECT);
gcc_assert (REG_P (sub_reg));
if (REGNO (sub_reg) != REGNO (count_reg))
emit_move_insn (count_reg, sub_reg);
}
for (i = 0; i < last_stage; i++)
duplicate_insns_of_cycles (ps, 0, i, 1);
duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
/* Put the prolog on the entry edge. */
e = loop_preheader_edge (loop);
split_edge_and_insert (e, get_insns ());
@ -670,8 +728,8 @@ generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_r
start_sequence ();
for (i = 0; i < last_stage; i++)
duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
/* Put the epilogue on the exit edge. */
gcc_assert (single_exit (loop));
e = single_exit (loop);
@ -907,20 +965,27 @@ sms_schedule (void)
}
/* Make sure this is a doloop. */
if ( !(count_reg = doloop_register_get (tail)))
if ( !(count_reg = doloop_register_get (head, tail)))
continue;
/* Don't handle BBs with calls or barriers, or !single_set insns,
or auto-increment insns (to avoid creating invalid reg-moves
for the auto-increment insns).
??? Should handle auto-increment insns. */
for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
if (CALL_P (insn)
|| BARRIER_P (insn)
|| (INSN_P (insn) && !JUMP_P (insn)
&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
|| (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0))
break;
??? Should handle auto-increment insns.
??? Should handle insns defining subregs. */
for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
{
rtx set;
if (CALL_P (insn)
|| BARRIER_P (insn)
|| (INSN_P (insn) && !JUMP_P (insn)
&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
|| (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
|| (INSN_P (insn) && (set = single_set (insn))
&& GET_CODE (SET_DEST (set)) == SUBREG))
break;
}
if (insn != NEXT_INSN (tail))
{
@ -932,8 +997,11 @@ sms_schedule (void)
fprintf (dump_file, "SMS loop-with-barrier\n");
else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
fprintf (dump_file, "SMS reg inc\n");
else
fprintf (dump_file, "SMS loop-with-not-single-set\n");
else if ((INSN_P (insn) && !JUMP_P (insn)
&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
fprintf (dump_file, "SMS loop-with-not-single-set\n");
else
fprintf (dump_file, "SMS loop with subreg in lhs\n");
print_rtl_single (dump_file, insn);
}
@ -998,7 +1066,7 @@ sms_schedule (void)
/* In case of th loop have doloop register it gets special
handling. */
count_init = NULL_RTX;
if ((count_reg = doloop_register_get (tail)))
if ((count_reg = doloop_register_get (head, tail)))
{
basic_block pre_header;
@ -1072,7 +1140,10 @@ sms_schedule (void)
the closing_branch was scheduled and should appear in the last (ii-1)
row. Otherwise, we are free to schedule the branch, and we let nodes
that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
row; this should reduce stage_count to minimum. */
row; this should reduce stage_count to minimum.
TODO: Revisit the issue of scheduling the insns of the
control part relative to the branch when the control part
has more than one insn. */
normalize_sched_times (ps);
rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
set_columns_for_ps (ps);
@ -1111,11 +1182,8 @@ sms_schedule (void)
if (dump_file)
print_node_sched_params (dump_file, g->num_nodes, g);
/* Generate prolog and epilog. */
if (count_reg && !count_init)
generate_prolog_epilog (ps, loop, count_reg);
else
generate_prolog_epilog (ps, loop, NULL_RTX);
generate_prolog_epilog (ps, loop, count_reg, count_init);
free_undo_replace_buff (reg_move_replaces);
}

View File

@ -1,3 +1,10 @@
2007-08-28 Mircea Namolaru <namolaru@il.ibm.com>
Vladimir Yanovsky <yanov@il.ibm.com>
Revital Eres <eres@il.ibm.com>
Andrey Belevantsev <abel@ispras.ru>
* gcc.dg/sms-1.c: New test.
2007-08-27 Alexandre Oliva <aoliva@redhat.com>
* g++.dg/ext/gnu-inline-common.h: New.

View File

@ -0,0 +1,38 @@
/* The same test as loop-3c.c. It failed on ia64
due to not handling of subreg in the lhs that is fixed. */
/* { dg-do run } */
/* { dg-options "-O2 -fmodulo-sched -fmodulo-sched-allow-regmoves -w" } */
#include <limits.h>
void * a[255];
f (m)
{
int i;
int sh = 0x100;
i = m;
do
{
a[sh >>= 1] = ((unsigned)i << 3) + (char*)a;
i += 4;
}
while (i < INT_MAX/2 + 1 + 4 * 4);
}
main ()
{
a[0x10] = 0;
a[0x08] = 0;
f (INT_MAX/2 + INT_MAX/4 + 2);
if (a[0x10] || a[0x08])
abort ();
a[0x10] = 0;
a[0x08] = 0;
f (INT_MAX/2 + 1);
if (! a[0x10] || a[0x08])
abort ();
exit (0);
}