Modulo-scheduling improvements. Patch 2 of 2

Co-Authored-By: Andrey Belevantsev <abel@ispras.ru> Co-Authored-By: Revital Eres <eres@il.ibm.com> Co-Authored-By: Vladimir Yanovsky <yanov@il.ibm.com> From-SVN: r127848
2007-08-28 06:52:16 +00:00 · 2007-08-28 06:52:16 +00:00 · 46dc0789fe
parent 7368348cb7
commit 46dc0789fe
7 changed files with 293 additions and 62 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,31 @@
+2007-08-28  Mircea Namolaru  <namolaru@il.ibm.com>
+            Vladimir Yanovsky  <yanov@il.ibm.com>
+            Revital Eres  <eres@il.ibm.com>
+            Andrey Belevantsev  <abel@ispras.ru>
+
+	* config/spu/spu.md: Recognize doloop pattern when -fmodulo-sched
+	is set.
+	* modulo-sched.c: Add documentation regarding do-loop.
+	(doloop_register_get): Change number of arguments to support
+	the new do-loop pattern and check whether COUNT_REG has no other
+	occurences in the loop besides in the control part.
+	(duplicate_insns_of_cycles): Do not duplicate the insn which
+	changes count_reg as it is already adjusted.
+	(generate_prolog_epilog): New argument to support the new
+	do-loop pattern.  Change the subtract instruction to use
+	expand_simple_binop.  Call duplicate_insns_of_cycles with new
+	argument.
+	(sms_schedule): Call doloop_register_get and
+	generate_prolog_epilog with new argument.  Do not handle loops
+	with single sets insns with subreg in their lhs.
+	* loop-doloop.c (doloop_optimize): Support for another do-loop
+	pattern.
+	(doloop_condition_get): Gets an instruction instead of a pattern
+	and change the return condition when the do-loop pattern is
+	not parallel.
+	* ddg.c (create_ddg_dep_from_intra_loop_link): Handle only reg
+	deps when considering to not create edges.
+
 2007-08-27  Alexandre Oliva  <aoliva@redhat.com>

 	* doc/extend.texi (gnu_inline funtion attribute): Document C++
--- a/gcc/config/spu/spu.md
+++ b/gcc/config/spu/spu.md
@ -3887,6 +3887,48 @@ selb\t%0,%4,%0,%3"
  [(set_attr "type" "br")])


+
+ ;; Define the subtract-one-and-jump insns so loop.c
+ ;; knows what to generate.
+ (define_expand "doloop_end"
+   [(use (match_operand 0 "" ""))      ; loop pseudo
+    (use (match_operand 1 "" ""))      ; iterations; zero if unknown
+    (use (match_operand 2 "" ""))      ; max iterations
+    (use (match_operand 3 "" ""))      ; loop level
+    (use (match_operand 4 "" ""))]     ; label
+   ""
+   "
+ {
+   /* Currently SMS relies on the do-loop pattern to recognize loops
+      where (1) the control part comprises of all insns defining and/or
+      using a certain 'count' register and (2) the loop count can be
+      adjusted by modifying this register prior to the loop.
+.     ??? The possible introduction of a new block to initialize the
+      new IV can potentially effects branch optimizations.  */
+   if (optimize > 0 && flag_modulo_sched)
+   {
+     rtx s0;
+     rtx bcomp;
+     rtx loc_ref;
+
+     /* Only use this on innermost loops.  */
+     if (INTVAL (operands[3]) > 1)
+       FAIL;
+     if (GET_MODE (operands[0]) != SImode)
+       FAIL;
+
+     s0 = operands [0];
+     emit_move_insn (s0, gen_rtx_PLUS (SImode, s0, GEN_INT (-1)));
+     bcomp = gen_rtx_NE(SImode, s0, const0_rtx);
+     loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [4]);
+     emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
+                                  gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+                                                        loc_ref, pc_rtx)));
+
+     DONE;
+   }
+ }")
+
 ;; convert between any two modes, avoiding any GCC assumptions
 (define_expand "spu_convert"
  [(set (match_operand 0 "spu_reg_operand" "")
--- a/gcc/ddg.c
+++ b/gcc/ddg.c
@ -176,13 +176,17 @@ create_ddg_dep_from_intra_loop_link (ddg_ptr g, ddg_node_ptr src_node,
      rtx set;

      set = single_set (dest_node->insn);
-      if (set)
+      /* TODO: Handle registers that REG_P is not true for them, i.e.
+         subregs and special registers.  */
+      if (set && REG_P (SET_DEST (set)))
        {
          int regno = REGNO (SET_DEST (set));
-          struct df_ref *first_def =
-            df_bb_regno_first_def_find (g->bb, regno);
+          struct df_ref *first_def;
          struct df_rd_bb_info *bb_info = DF_RD_BB_INFO (g->bb);

+          first_def = df_bb_regno_first_def_find (g->bb, regno);
+          gcc_assert (first_def);
+
          if (bitmap_bit_p (bb_info->gen, first_def->id))
            return;
        }
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@ -69,35 +69,59 @@ along with GCC; see the file COPYING3.  If not see
   if it is not a decrement and branch jump insn.  */

 rtx
-doloop_condition_get (rtx pattern)
+doloop_condition_get (rtx doloop_pat)
 {
  rtx cmp;
  rtx inc;
  rtx reg;
  rtx inc_src;
  rtx condition;
+  rtx pattern;

-  /* The canonical doloop pattern we expect is:
+  /* The canonical doloop pattern we expect has one of the following
+     forms:

-     (parallel [(set (pc) (if_then_else (condition)
-                                        (label_ref (label))
-                                        (pc)))
-                (set (reg) (plus (reg) (const_int -1)))
-                (additional clobbers and uses)])
+     1)  (parallel [(set (pc) (if_then_else (condition)
+	  			            (label_ref (label))
+				            (pc)))
+	             (set (reg) (plus (reg) (const_int -1)))
+	             (additional clobbers and uses)])

-     Some targets (IA-64) wrap the set of the loop counter in
-     an if_then_else too.
+     The branch must be the first entry of the parallel (also required
+     by jump.c), and the second entry of the parallel must be a set of
+     the loop counter register.  Some targets (IA-64) wrap the set of
+     the loop counter in an if_then_else too.

-     In summary, the branch must be the first entry of the
-     parallel (also required by jump.c), and the second
-     entry of the parallel must be a set of the loop counter
-     register.  */
+     2)  (set (reg) (plus (reg) (const_int -1))
+         (set (pc) (if_then_else (reg != 0)
+	                         (label_ref (label))
+			         (pc))).  */
+
+  pattern = PATTERN (doloop_pat);

  if (GET_CODE (pattern) != PARALLEL)
-    return 0;
+    {
+      rtx cond;

-  cmp = XVECEXP (pattern, 0, 0);
-  inc = XVECEXP (pattern, 0, 1);
+      /* We expect the decrement to immediately precede the branch.  */
+
+      if ((PREV_INSN (doloop_pat) == NULL_RTX)
+          || !INSN_P (PREV_INSN (doloop_pat)))
+        return 0;
+
+      cmp = pattern;
+      inc = PATTERN (PREV_INSN (doloop_pat));
+      /* We expect the condition to be of the form (reg != 0)  */
+      cond = XEXP (SET_SRC (cmp), 0);
+      if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+        return 0;
+
+    }
+  else
+    {
+      cmp = XVECEXP (pattern, 0, 0);
+      inc = XVECEXP (pattern, 0, 1);
+    }

  /* Check for (set (reg) (something)).  */
  if (GET_CODE (inc) != SET)
@ -139,7 +163,29 @@ doloop_condition_get (rtx pattern)
  if ((XEXP (condition, 0) == reg)
      || (GET_CODE (XEXP (condition, 0)) == PLUS
 		   && XEXP (XEXP (condition, 0), 0) == reg))
+   {
+     if (GET_CODE (pattern) != PARALLEL)
+     /*  The second form we expect:
+
+         (set (reg) (plus (reg) (const_int -1))
+         (set (pc) (if_then_else (reg != 0)
+                                 (label_ref (label))
+                                 (pc))).
+
+         is equivalent to the following:
+
+         (parallel [(set (pc) (if_then_else (reg != 1)
+                                            (label_ref (label))
+                                            (pc)))
+                     (set (reg) (plus (reg) (const_int -1)))
+                     (additional clobbers and uses)])
+
+         So we return that form instead.
+     */
+        condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+
    return condition;
+   }

  /* ??? If a machine uses a funny comparison, we could return a
     canonicalized form here.  */
@ -597,9 +643,7 @@ doloop_optimize (struct loop *loop)
    {
      while (NEXT_INSN (doloop_pat) != NULL_RTX)
 	doloop_pat = NEXT_INSN (doloop_pat);
-      if (JUMP_P (doloop_pat))
-	doloop_pat = PATTERN (doloop_pat);
-      else
+      if (!JUMP_P (doloop_pat))
 	doloop_pat = NULL_RTX;
    }

--- a/gcc/modulo-sched.c
+++ b/gcc/modulo-sched.c
@ -82,8 +82,21 @@ along with GCC; see the file COPYING3.  If not see
      perform modulo variable expansion for live ranges that span more than
      II cycles (i.e. use register copies to prevent a def from overwriting
      itself before reaching the use).
-*/

+    SMS works with countable loops (1) whose control part can be easily
+    decoupled from the rest of the loop and (2) whose loop count can
+    be easily adjusted.  This is because we peel a constant number of
+    iterations into a prologue and epilogue for which we want to avoid
+    emitting the control part, and a kernel which is to iterate that
+    constant number of iterations less than the original loop.  So the
+    control part should be a set of insns clearly identified and having
+    its own iv, not otherwise used in the loop (at-least for now), which
+    initializes a register before the loop to the number of iterations.
+    Currently SMS relies on the do-loop pattern to recognize such loops,
+    where (1) the control part comprises of all insns defining and/or
+    using a certain 'count' register and (2) the loop count can be
+    adjusted by modifying this register prior to the loop.  
+    TODO: Rely on cfgloop analysis instead.  */

 /* This page defines partial-schedule structures and functions for
   modulo scheduling.  */
@ -182,10 +195,11 @@ static int sms_order_nodes (ddg_ptr, int, int * result);
 static void set_node_sched_params (ddg_ptr);
 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
-static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
+static void generate_prolog_epilog (partial_schedule_ptr, struct loop *loop,
+                                    rtx, rtx);
 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
 				       int from_stage, int to_stage,
-				       int is_prolog);
+				       int is_prolog, rtx count_reg);

 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
@ -261,20 +275,22 @@ static struct sched_info sms_sched_info =
 };


-/* Return the register decremented and tested in INSN,
-   or zero if it is not a decrement-and-branch insn.  */
-
+/* Given HEAD and TAIL which are the first and last insns in a loop;
+   return the register which controls the loop.  Return zero if it has
+   more than one occurrence in the loop besides the control part or the
+   do-loop pattern is not of the form we expect.  */
 static rtx
-doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
+doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
 {
 #ifdef HAVE_doloop_end
-  rtx pattern, reg, condition;
+  rtx reg, condition, insn;
+  bool found = false;

-  if (! JUMP_P (insn))
+  if (!JUMP_P (tail))
    return NULL_RTX;

-  pattern = PATTERN (insn);
-  condition = doloop_condition_get (pattern);
+  /* TODO: Free SMS's dependence on doloop_condition_get.  */
+  condition = doloop_condition_get (tail);
  if (! condition)
    return NULL_RTX;

@ -286,6 +302,29 @@ doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
  else
    gcc_unreachable ();

+  /* Check that the COUNT_REG has no other occurrences in the loop
+     until the decrement.  We assume the control part consists of
+     either a single (parallel) branch-on-count or a (non-parallel)
+     branch immediately preceded by a single (decrement) insn.  */
+  for (insn = head; insn != PREV_INSN (tail); insn = NEXT_INSN (insn))
+    if ((found = reg_mentioned_p (reg, insn)) == true)
+      break;
+  if (found)
+    {
+      if (dump_file)
+        fprintf (dump_file, "SMS count_reg found outside control\n");
+
+      return NULL_RTX;
+    }
+  /* One last check in case the do-loop pattern is parallel.  */
+  if (GET_CODE (PATTERN (tail)) == PARALLEL)
+    if (reg_mentioned_p (reg, PREV_INSN (tail)))
+      {
+        if (dump_file)
+          fprintf (dump_file, "SMS count_reg found outside control\n");
+
+        return NULL_RTX;
+      }
  return reg;
 #else
  return NULL_RTX;
@ -583,7 +622,7 @@ permute_partial_schedule (partial_schedule_ptr ps, rtx last)

 static void
 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
-			   int to_stage, int for_prolog)
+			   int to_stage, int for_prolog, rtx count_reg)
 {
  int row;
  ps_insn_ptr ps_ij;
@ -595,6 +634,13 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 	int j, i_reg_moves;
 	rtx reg_move = NULL_RTX;

+        /* Do not duplicate any insn which refers to count_reg as it
+           belongs to the control part.
+           TODO: This should be done by analyzing the control part of
+           the loop.  */
+        if (reg_mentioned_p (count_reg, u_node->insn))
+          continue;
+
 	if (for_prolog)
 	  {
 	    /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
@ -643,23 +689,35 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,

 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
 static void
-generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
+generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
+                        rtx count_reg, rtx count_init)
 {
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;
-
+  
  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

-  if (count_reg)
-   /* Generate a subtract instruction at the beginning of the prolog to
-      adjust the loop count by STAGE_COUNT.  */
-   emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
+  if (!count_init)
+    {
+      /* Generate instructions at the beginning of the prolog to
+         adjust the loop count by STAGE_COUNT.  If loop count is constant
+         (count_init), this constant is adjusted by STAGE_COUNT in
+         generate_prolog_epilog function.  */
+      rtx sub_reg = NULL_RTX;
+
+      sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
+                                     count_reg, GEN_INT (last_stage),
+                                     count_reg, 1, OPTAB_DIRECT);
+      gcc_assert (REG_P (sub_reg));
+      if (REGNO (sub_reg) != REGNO (count_reg))
+        emit_move_insn (count_reg, sub_reg);
+    }

  for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, 0, i, 1);
-
+    duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
+  
  /* Put the prolog on the entry edge.  */
  e = loop_preheader_edge (loop);
  split_edge_and_insert (e, get_insns ());
@ -670,8 +728,8 @@ generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_r
  start_sequence ();

  for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
-
+    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
+  
  /* Put the epilogue on the exit edge.  */
  gcc_assert (single_exit (loop));
  e = single_exit (loop);
@ -907,20 +965,27 @@ sms_schedule (void)
        }

      /* Make sure this is a doloop.  */
-      if ( !(count_reg = doloop_register_get (tail)))
+      if ( !(count_reg = doloop_register_get (head, tail)))
 	continue;

      /* Don't handle BBs with calls or barriers, or !single_set insns,
         or auto-increment insns (to avoid creating invalid reg-moves
         for the auto-increment insns).  
-         ??? Should handle auto-increment insns.  */
-      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
-	if (CALL_P (insn)
-	    || BARRIER_P (insn)
-	    || (INSN_P (insn) && !JUMP_P (insn)
-		&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
-            || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0))
-	  break;
+         ??? Should handle auto-increment insns.
+         ??? Should handle insns defining subregs.  */
+     for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
+      {
+         rtx set;
+
+        if (CALL_P (insn)
+            || BARRIER_P (insn)
+            || (INSN_P (insn) && !JUMP_P (insn)
+                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
+            || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
+            || (INSN_P (insn) && (set = single_set (insn))
+                && GET_CODE (SET_DEST (set)) == SUBREG))
+        break;
+      }

      if (insn != NEXT_INSN (tail))
 	{
@ -932,8 +997,11 @@ sms_schedule (void)
 		fprintf (dump_file, "SMS loop-with-barrier\n");
              else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
                fprintf (dump_file, "SMS reg inc\n");
-	      else
-		fprintf (dump_file, "SMS loop-with-not-single-set\n");
+              else if ((INSN_P (insn) && !JUMP_P (insn)
+                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
+                fprintf (dump_file, "SMS loop-with-not-single-set\n");
+              else
+               fprintf (dump_file, "SMS loop with subreg in lhs\n");
 	      print_rtl_single (dump_file, insn);
 	    }

@ -998,7 +1066,7 @@ sms_schedule (void)
      /* In case of th loop have doloop register it gets special
 	 handling.  */
      count_init = NULL_RTX;
-      if ((count_reg = doloop_register_get (tail)))
+      if ((count_reg = doloop_register_get (head, tail)))
 	{
 	  basic_block pre_header;

@ -1072,7 +1140,10 @@ sms_schedule (void)
 	     the closing_branch was scheduled and should appear in the last (ii-1)
 	     row.  Otherwise, we are free to schedule the branch, and we let nodes
 	     that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
-	     row; this should reduce stage_count to minimum.  */
+	     row; this should reduce stage_count to minimum.  
+             TODO: Revisit the issue of scheduling the insns of the
+             control part relative to the branch when the control part
+             has more than one insn.  */
 	  normalize_sched_times (ps);
 	  rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
 	  set_columns_for_ps (ps);
@ -1111,11 +1182,8 @@ sms_schedule (void)
 	  if (dump_file)
 	    print_node_sched_params (dump_file, g->num_nodes, g);
 	  /* Generate prolog and epilog.  */
-	  if (count_reg && !count_init)
-	    generate_prolog_epilog (ps, loop, count_reg);
-	  else
-	    generate_prolog_epilog (ps, loop, NULL_RTX);
-	    
+          generate_prolog_epilog (ps, loop, count_reg, count_init);
+ 
 	  free_undo_replace_buff (reg_move_replaces);
 	}

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,10 @@
+2007-08-28  Mircea Namolaru  <namolaru@il.ibm.com>
+            Vladimir Yanovsky  <yanov@il.ibm.com>
+            Revital Eres  <eres@il.ibm.com>
+            Andrey Belevantsev  <abel@ispras.ru>
+
+	* gcc.dg/sms-1.c: New test.
+
 2007-08-27  Alexandre Oliva  <aoliva@redhat.com>

 	* g++.dg/ext/gnu-inline-common.h: New.
--- a/gcc/testsuite/gcc.dg/sms-1.c
+++ b/gcc/testsuite/gcc.dg/sms-1.c
@ -0,0 +1,38 @@
+/* The same test as loop-3c.c.  It failed on ia64
+   due to not handling of subreg in the lhs that is fixed.  */
+/* { dg-do run } */
+/* { dg-options "-O2 -fmodulo-sched -fmodulo-sched-allow-regmoves -w" } */
+
+
+#include <limits.h>
+
+void * a[255];
+
+f (m)
+{
+  int i;
+  int sh = 0x100;
+  i = m;
+  do
+    {
+      a[sh >>= 1] = ((unsigned)i << 3)  + (char*)a;
+      i += 4;
+    }
+  while (i < INT_MAX/2 + 1 + 4 * 4);
+}
+
+main ()
+{
+  a[0x10] = 0;
+  a[0x08] = 0;
+  f (INT_MAX/2 + INT_MAX/4 + 2);
+  if (a[0x10] || a[0x08])
+    abort ();
+  a[0x10] = 0;
+  a[0x08] = 0;
+  f (INT_MAX/2 + 1);
+  if (! a[0x10] || a[0x08])
+    abort ();
+  exit (0);
+}
+