* Integrate Haifa instruction scheduler.

* Integrate regmove pass. See ChangeLog for details. From-SVN: r14770
1997-08-12 04:07:19 +00:00 · 1997-08-12 04:07:19 +00:00 · 8c660648ca
parent aa32d84158
commit 8c660648ca
15 changed files with 10737 additions and 10 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,76 @@
+Mon Aug 11 14:50:55 1997  Jeffrey A Law  (law@cygnus.com)
+
+	* Integrate Haifa instruction scheduler.
+	* Makefile.in (ALL_CFLAGS): Add SCHED_CFLAGS.  Prefix all references
+	to sched with $(SCHED_PREFIX).
+	* configure.in: Handle --enable-haifa.
+	* configure: Rebuilt.
+	* flags.h: Add new flags for haifa instruction scheduler.
+	* genattrtab.c (expand_units): For haifa, don't subtract one
+	when computing blockage.
+	* toplev.c (flag_schedule_interblock): Haifa scheduler flag.
+	(flag_schedule_speculative): Ditto.
+	(flag_schedule_speculative_load): Ditto.
+	(flag_schedule_speculative_load_dangerous): Ditto.
+	(flag_schedule_reverse_before_reload): Ditto.
+	(flag_schedule_reverse_after_reload): Ditto.
+	(flag_branch_on_count_reg): Ditto.
+	(f_options): Add Haifa switches.
+	(main): Turn off some Haifa options if appropriate macro is
+	defined.  Process Haifa switches.
+	* unroll.c (iteration_info): No longer static, since Haifa
+	scheduler uses it.
+	(unroll_loop): Inform HAIFA scheduler about loop unrolling factor.
+	* unroll.c (unroll_loop): Set loop_unroll_iter, loop_start_value.
+	* loop.h (loop_unroll_factor, loop_number): Add HAIFA decls.
+	* loop.h (loop_initial_value,loop_unroll_iter): New globals.
+	* loop.c (loop_optimize): If HAIFA is defined, allocate additional
+	storage for the Haifa scheduler.
+	(mark_loop_jump): If HAIFA defined, set LABEL_OUTSIDE_LOOP_P and
+	LABEL_NEXTREF.
+	(strength_reduce): If HAIFA and HAVE_decrement_and_branch_on_count
+	are defined, call analyze_loop_iterations and insert_bct to use
+	countdown loops.
+	(record_giv): Refine test for jumps out of loops if HAIFA is
+	defined.
+	(analyze_loop_iterations): New function to identify if we can use
+	a countdown loop.
+	(insert_bct): Insert countdown loop.
+	(instrument_loop_bct): Low level code to insert countdown loop.
+	(loop_number): Calculate UID of loop.
+	(indirect_jump_in_function_p): Return true if an indirect jump is
+	in the function.
+	(is_power_of_2): Return true if value is a power of 2.
+	(is_conditional_branch): Return true if insn is a conditional
+	jump.
+	(fix_bct_param): Process -fbct-{min,max}-N switches.
+	(check_bct_param): Return true if loop should be instrumented.
+	* loop.c (loop_initial_value,loop_unroll_iter): New globals.
+	(loop_optimize): Initialize.
+	(get_condition_for_loop): Ditto.
+	* loop.c (strength_reduce): Inside of code that uses #ifdef
+	HAVE_decrement_and_branch_on_count code, test it to make sure the
+	condition is true.
+	(instrument_loop_bct): Ditto.
+	* haifa-sched.c: New file.
+	
+
+	* Integrate regmove pass.
+	* Makefile.in (OBJS): Add regmove.o
+	(regmove.o): Add dependencies.
+	* flow.c (find_use_as_address): No longer static.
+	* rtl.h (find_use_as_address): Declare.
+	* toplev.c (regmove_dump, flag_regmove): Define.
+	(f_options): Add -fregmove.
+	(regmove_dump_file, regmove_time): Define.
+	(fatal_insn): Close the regmove dump file.
+	(compile_file): Initialize regmove_time; open/close the regmove dump
+	file as needed.  Print regmove time as needed.
+	(rest_of_compilation): Run regmove pass if requested, dump
+	RTL after regmove if requested.
+	(main): If -O2 or more, turn on regmove.  Handle dump switches.
+	* regmove.c: New file.
+	
 Mon Aug 11 14:15:02 1997  Jeffrey A Law  (law@cygnus.com)

 	* Integrate tlink patch from jason@cygnus.com
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@ -444,7 +444,7 @@ INTERNAL_CFLAGS = $(CROSS) -DIN_GCC @extra_c_flags@

 # This is the variable actually used when we compile.
 ALL_CFLAGS = $(INTERNAL_CFLAGS) $(X_CFLAGS) $(T_CFLAGS) $(CFLAGS) $(XCFLAGS) \
-	@DEFS@
+	@DEFS@ $(SCHED_CFLAGS)

 # Likewise.
 ALL_CPPFLAGS = $(CPPFLAGS) $(X_CPPFLAGS) $(T_CPPFLAGS)
@ -548,14 +548,17 @@ BC_OBJS = bc-emit.o bc-optab.o
 # Bytecode header files constructed at build time; vmsconfig.com wants this.
 BC_ALL = bc-arity.h bc-opcode.h bc-opname.h

+SCHED_PREFIX = @sched_prefix@
+SCHED_CFLAGS = @sched_cflags@
+
 # Language-independent object files.
 OBJS = toplev.o version.o tree.o print-tree.o stor-layout.o fold-const.o \
 function.o stmt.o except.o expr.o calls.o expmed.o explow.o optabs.o \
- varasm.o rtl.o print-rtl.o rtlanal.o emit-rtl.o real.o \
- dbxout.o sdbout.o dwarfout.o dwarf2out.o xcoffout.o bitmap.o \
+ varasm.o rtl.o print-rtl.o rtlanal.o emit-rtl.o real.o regmove.o \
+ dbxout.o sdbout.o dwarfout.o dwarf2out.o xcoffout.o bitmap.o alias.o \
 integrate.o jump.o cse.o loop.o unroll.o flow.o stupid.o combine.o \
 regclass.o local-alloc.o global.o reload.o reload1.o caller-save.o \
- insn-peep.o reorg.o alias.o sched.o final.o recog.o reg-stack.o \
+ insn-peep.o reorg.o $(SCHED_PREFIX)sched.o final.o recog.o reg-stack.o \
 insn-opinit.o insn-recog.o insn-extract.o insn-output.o insn-emit.o \
 profile.o insn-attrtab.o $(out_object_file) getpwd.o convert.o $(EXTRA_OBJS)

@ -1326,7 +1329,9 @@ reorg.o : reorg.c $(CONFIG_H) $(RTL_H) conditions.h hard-reg-set.h \
   flags.h output.h
 alias.o : alias.c $(CONFIG_H) $(RTL_H) flags.h hard-reg-set.h regs.h \
   insn-codes.h
-sched.o : $(SCHED_PREFIX)sched.c $(CONFIG_H) $(RTL_H) $(BASIC_BLOCK_H) regs.h hard-reg-set.h \
+regmove.o : regmove.c $(CONFIG_H) $(RTL_H) insn-config.h recog.h output.h \
+  reload.h regs.h hard-reg-set.h flags.h expr.h insn-flags.h
+$(SCHED_PREFIX)sched.o : $(SCHED_PREFIX)sched.c $(CONFIG_H) $(RTL_H) $(BASIC_BLOCK_H) regs.h hard-reg-set.h \
   flags.h insn-config.h insn-attr.h
 final.o : final.c $(CONFIG_H) $(RTL_H) $(TREE_H) flags.h regs.h \
   recog.h conditions.h insn-config.h insn-attr.h except.h real.h output.h \
--- a/gcc/configure
+++ b/gcc/configure
@ -4361,6 +4361,26 @@ if [ ! -f Makefile.in ]; then
 	echo "source ${srcdir}/.gdbinit" >> .gdbinit
 fi

+# Override SCHED_OBJ and SCHED_CFLAGS to enable the Haifa scheduler.
+sched_prefix=
+sched_cflags=
+if [[ x$enable_haifa = xyes ]]; then
+    echo "Using the Haifa scheduler."
+    sched_prefix=haifa-
+    sched_cflags=-DHAIFA
+fi
+
+
+if [[ x$enable_haifa != x ]]; then
+    # Explicitly remove files that need to be recompiled for the Haifa scheduler.
+    for x in genattrtab.o toplev.o loop.o unroll.o *sched.o; do
+	if [ -f $x ]; then
+	    echo "Removing $x"
+	    rm -f $x
+	fi
+    done
+fi
+
 # Process the language and host/target makefile fragments.
 ${CONFIG_SHELL-/bin/sh} $srcdir/configure.frag $srcdir "$subdirs" "$dep_host_xmake_file" "$dep_tmake_file"

@ -4602,6 +4622,8 @@ s%@CC@%$CC%g
 s%@SET_MAKE@%$SET_MAKE%g
 s%@CPP@%$CPP%g
 s%@manext@%$manext%g
+s%@sched_prefix@%$sched_prefix%g
+s%@sched_cflags@%$sched_cflags%g
 s%@objext@%$objext%g
 s%@subdirs@%$subdirs%g
 s%@all_languages@%$all_languages%g
--- a/gcc/configure.in
+++ b/gcc/configure.in
@ -3027,6 +3027,26 @@ if [[ ! -f Makefile.in ]]; then
 	echo "source ${srcdir}/.gdbinit" >> .gdbinit
 fi

+# Override SCHED_OBJ and SCHED_CFLAGS to enable the Haifa scheduler.
+sched_prefix=
+sched_cflags=
+if [[ x$enable_haifa = xyes ]]; then
+    echo "Using the Haifa scheduler."
+    sched_prefix=haifa-
+    sched_cflags=-DHAIFA
+fi
+AC_SUBST(sched_prefix)
+AC_SUBST(sched_cflags)
+if [[ x$enable_haifa != x ]]; then
+    # Explicitly remove files that need to be recompiled for the Haifa scheduler.
+    for x in genattrtab.o toplev.o loop.o unroll.o *sched.o; do
+	if [ -f $x ]; then
+	    echo "Removing $x"
+	    rm -f $x
+	fi
+    done
+fi
+
 # Process the language and host/target makefile fragments.
 ${CONFIG_SHELL-/bin/sh} $srcdir/configure.frag $srcdir "$subdirs" "$dep_host_xmake_file" "$dep_tmake_file"

--- a/gcc/flags.h
+++ b/gcc/flags.h
@ -304,6 +304,34 @@ extern int flag_shared_data;
 extern int flag_schedule_insns;
 extern int flag_schedule_insns_after_reload;

+#ifdef HAIFA
+/* The following flags have effect only for scheduling before register
+   allocation:
+
+   flag_schedule_interblock means schedule insns across basic blocks.
+   flag_schedule_speculative means allow speculative motion of non-load insns.
+   flag_schedule_speculative_load means allow speculative motion of some
+   load insns.
+   flag_schedule_speculative_load_dangerous allows speculative motion of more
+   load insns.
+   flag_schedule_reverse_before_reload means try to reverse original order
+   of insns (S).
+   flag_schedule_reverse_after_reload means try to reverse original order
+   of insns (R).  */
+
+extern int flag_schedule_interblock;
+extern int flag_schedule_speculative;
+extern int flag_schedule_speculative_load;
+extern int flag_schedule_speculative_load_dangerous;
+extern int flag_schedule_reverse_before_reload;
+extern int flag_schedule_reverse_after_reload;
+
+/* flag_branch_on_count_reg means try to replace add-1,compare,branch tuple
+   by a cheaper branch, on a count register. */
+extern int flag_branch_on_count_reg;
+#endif  /* HAIFA */
+
+
 /* Nonzero means put things in delayed-branch slots if supported. */

 extern int flag_delayed_branch;
--- a/gcc/flow.c
+++ b/gcc/flow.c
@ -268,7 +268,6 @@ static void find_auto_inc		PROTO((regset, rtx, rtx));
 static void mark_used_regs		PROTO((regset, regset, rtx, int, rtx));
 static int try_pre_increment_1		PROTO((rtx));
 static int try_pre_increment		PROTO((rtx, rtx, HOST_WIDE_INT));
-static rtx find_use_as_address		PROTO((rtx, rtx, HOST_WIDE_INT));
 void dump_flow_info			PROTO((FILE *));

 /* Find basic blocks of the current function and perform data flow analysis.
@ -2795,7 +2794,7 @@ try_pre_increment (insn, reg, amount)
   If REG appears more than once, or is used other than in such an address,
   return (rtx)1.  */

-static rtx
+rtx
 find_use_as_address (x, reg, plusconst)
     register rtx x;
     rtx reg;
--- a/gcc/genattrtab.c
+++ b/gcc/genattrtab.c
@ -2003,6 +2003,9 @@ expand_units ()

 	  for (op = unit->ops; op; op = op->next)
 	    {
+#ifdef HAIFA
+	      rtx blockage = op->issue_exp;
+#else
 	      rtx blockage = operate_exp (POS_MINUS_OP, readycost,
 					  make_numeric_value (1));

@ -2018,6 +2021,7 @@ expand_units ()
 				      blockage);

 	      blockage = operate_exp (MAX_OP, blockage, op->issue_exp);
+#endif
 	      blockage = simplify_knowing (blockage, unit->condexp);

 	      /* Add this op's contribution to MAX (BLOCKAGE (E,*)) and
--- a/gcc/haifa-sched.c
+++ b/gcc/haifa-sched.c
--- a/gcc/invoke.texi
+++ b/gcc/invoke.texi
@ -147,7 +147,7 @@ in the following sections.
 -ffunction-sections -finline-functions
 -fkeep-inline-functions -fno-default-inline
 -fno-defer-pop  -fno-function-cse
-fno-inline  -fno-peephole  -fomit-frame-pointer
+-fno-inline  -fno-peephole  -fomit-frame-pointer -fregmove
 -frerun-cse-after-loop  -fschedule-insns
 -fschedule-insns2  -fstrength-reduce  -fthread-jumps
 -funroll-all-loops  -funroll-loops
@ -2195,6 +2195,12 @@ used in one place: in @file{reorg.c}, instead of guessing which path a
 branch is mostly to take, the @samp{REG_BR_PROB} values are used to
 exactly determine which path is taken more often.
@end ifset
+
+@item -fregmove
+Some machines only support 2 operands per instruction.  On such
+machines, GNU CC might have to do extra copies.  The @samp{-fregmove}
+option overrides the default for the machine to do the copy before
+register allocation.
@end table

@node Preprocessor Options
--- a/gcc/loop.c
+++ b/gcc/loop.c
@ -81,6 +81,42 @@ static rtx *loop_number_loop_starts, *loop_number_loop_ends;

 int *loop_outer_loop;

+#ifdef HAIFA
+/* The main output of analyze_loop_iterations is placed here */
+
+int *loop_can_insert_bct;
+
+/* For each loop, determines whether some of its inner loops have used
+   the count register */
+
+int *loop_used_count_register;
+
+/* For each loop, remember its unrolling factor (if at all).
+   contents of the array:
+   0/1: not unrolled.
+   -1: completely unrolled - no further instrumentation is needed.
+   >1: holds the exact amount of unrolling.  */
+
+int *loop_unroll_factor;
+int *loop_unroll_iter;
+
+/* loop parameters for arithmetic loops. These loops have a loop variable
+   which is initialized to loop_start_value, incremented in each iteration
+   by "loop_increment".  At the end of the iteration the loop variable is
+   compared to the loop_comparison_value (using loop_comparison_code).  */
+
+rtx *loop_increment;
+rtx *loop_comparison_value;
+rtx *loop_start_value;
+enum rtx_code *loop_comparison_code;
+
+/* for debugging: selects sub-range of loops for which the bct optimization
+   is invoked.  The numbering is per compilation-unit.  */
+int dbg_bct_min = -1;
+int dbg_bct_max = -1;
+#endif  /* HAIFA */
+
+
 /* Indexed by loop number, contains a nonzero value if the "loop" isn't
   really a loop (an insn outside the loop branches into it).  */

@ -286,6 +322,32 @@ static int maybe_eliminate_biv_1 ();
 static int last_use_this_basic_block ();
 static void record_initial ();
 static void update_reg_last_use ();
+
+#ifdef HAIFA
+/* This is extern from unroll.c */
+void iteration_info ();
+
+/* Two main functions for implementing bct:
+   first - to be called before loop unrolling, and the second - after */
+static void analyze_loop_iterations ();
+static void insert_bct ();
+
+/* Auxiliary function that inserts the bct pattern into the loop */
+static void instrument_loop_bct ();
+
+/* Indirect_jump_in_function is computed once per function.  */
+int indirect_jump_in_function = 0;
+static int indirect_jump_in_function_p ();
+
+int loop_number ();
+static int is_power_of_2();
+static int is_conditional_branch ();
+
+/* Debugging functions.  */
+int fix_bct_param ();
+static int check_bct_param ();
+#endif  /* HAIFA */
+

 /* Relative gain of eliminating various kinds of operations.  */
 int add_cost;
@ -379,6 +441,32 @@ loop_optimize (f, dumpfile)
  loop_number_exit_labels = (rtx *) alloca (max_loop_num * sizeof (rtx));
  loop_number_exit_count = (int *) alloca (max_loop_num * sizeof (int));

+#ifdef HAIFA
+  /* Allocate for BCT optimization */
+  loop_can_insert_bct = (int *) alloca (max_loop_num * sizeof (int));
+  bzero ((char *) loop_can_insert_bct, max_loop_num * sizeof (int));
+
+  loop_used_count_register = (int *) alloca (max_loop_num * sizeof (int));
+  bzero ((char *) loop_used_count_register, max_loop_num * sizeof (int));
+
+  loop_unroll_factor = (int *) alloca (max_loop_num *sizeof (int));
+  bzero ((char *) loop_unroll_factor, max_loop_num * sizeof (int));
+
+  loop_unroll_iter = (int *) alloca (max_loop_num *sizeof (int));
+  bzero ((char *) loop_unroll_iter, max_loop_num * sizeof (int));
+
+  loop_increment = (rtx *) alloca (max_loop_num * sizeof (rtx));
+  loop_comparison_value = (rtx *) alloca (max_loop_num * sizeof (rtx));
+  loop_start_value = (rtx *) alloca (max_loop_num * sizeof (rtx));
+  bzero ((char *) loop_increment, max_loop_num * sizeof (rtx));
+  bzero ((char *) loop_comparison_value, max_loop_num * sizeof (rtx));
+  bzero ((char *) loop_start_value, max_loop_num * sizeof (rtx));
+
+  loop_comparison_code 
+    = (enum rtx_code *) alloca (max_loop_num * sizeof (enum rtx_code));
+  bzero ((char *) loop_comparison_code, max_loop_num * sizeof (enum rtx_code));
+#endif  /* HAIFA */
+
  /* Find and process each loop.
     First, find them, and record them in order of their beginnings.  */
  find_and_verify_loops (f);
@ -430,6 +518,12 @@ loop_optimize (f, dumpfile)
  if (flag_unroll_loops && write_symbols != NO_DEBUG)
    find_loop_tree_blocks ();

+#ifdef HAIFA
+  /* determine if the function has indirect jump. If it does,
+     we cannot instrument loops in this function with bct */
+  indirect_jump_in_function = indirect_jump_in_function_p (f);
+#endif  /* HAIFA */
+
  /* Now scan the loops, last ones first, since this means inner ones are done
     before outer ones.  */
  for (i = max_loop_num-1; i >= 0; i--)
@ -2639,6 +2733,11 @@ mark_loop_jump (x, loop_num)

      if (loop_num != -1)
 	{
+#ifdef HAIFA
+	  LABEL_OUTSIDE_LOOP_P (x) = 1;
+	  LABEL_NEXTREF (x) = loop_number_exit_labels[loop_num];
+#endif  /* HAIFA */
+
 	  loop_number_exit_labels[loop_num] = x;

 	  for (outer_loop = loop_num; outer_loop != -1;
@ -3755,6 +3854,16 @@ strength_reduce (scan_start, end, loop_top, insn_count,
     so that "decrement and branch until zero" insn can be used.  */
  check_dbra_loop (loop_end, insn_count, loop_start);

+#ifdef HAIFA
+  /* record loop-variables relevant for BCT optimization before unrolling
+     the loop.  Unrolling may update part of this information, and the
+     correct data will be used for generating the BCT.  */
+#ifdef HAVE_decrement_and_branch_on_count
+  if (HAVE_decrement_and_branch_on_count)
+    analyze_loop_iterations (loop_start, loop_end);
+#endif
+#endif  /* HAIFA */
+
  /* Create reg_map to hold substitutions for replaceable giv regs.  */
  reg_map = (rtx *) alloca (max_reg_before_loop * sizeof (rtx));
  bzero ((char *) reg_map, max_reg_before_loop * sizeof (rtx));
@ -4247,6 +4356,14 @@ strength_reduce (scan_start, end, loop_top, insn_count,
  if (flag_unroll_loops)
    unroll_loop (loop_end, insn_count, loop_start, end_insert_before, 1);

+#ifdef HAIFA
+  /* instrument the loop with bct insn */
+#ifdef HAVE_decrement_and_branch_on_count
+  if (HAVE_decrement_and_branch_on_count)
+    insert_bct (loop_start, loop_end);
+#endif
+#endif  /* HAIFA */
+
  if (loop_dump_stream)
    fprintf (loop_dump_stream, "\n");
 }
@ -6932,3 +7049,638 @@ get_condition_for_loop (x)
  return gen_rtx (swap_condition (GET_CODE (comparison)), VOIDmode,
 		  XEXP (comparison, 1), XEXP (comparison, 0));
 }
+
+#ifdef HAIFA
+/* Analyze a loop in order to instrument it with the use of count register.
+   LOOP_START and LOOP_END are the first and last insns of the loop.
+   This function works in cooperation with insert_bct ().
+   loop_can_insert_bct[loop_num] is set according to whether the optimization
+   is applicable to the loop.  When it is applicable, the following variables
+   are also set:
+    loop_start_value[loop_num]
+    loop_comparison_value[loop_num]
+    loop_increment[loop_num]
+    loop_comparison_code[loop_num]
+   and loop_used_count_register[] is set for this loop and for every loop
+   enclosing it.
+
+   Each rejection is reported on loop_dump_stream when that stream is open,
+   except for the two earliest tests (check_bct_param and
+   flag_branch_on_count_reg), which return silently.  */
+
+static
+void analyze_loop_iterations (loop_start, loop_end)
+  rtx loop_start, loop_end;
+{
+  rtx comparison, comparison_value;
+  rtx iteration_var, initial_value, increment;
+  enum rtx_code comparison_code;
+
+  rtx last_loop_insn;
+  rtx insn;
+  int i;
+
+  /* machine mode of the iteration variable */
+  enum machine_mode original_mode;
+
+  /* find the number of the loop */
+  int loop_num = loop_number (loop_start, loop_end);
+
+  /* we change our mind only when we are sure that loop will be instrumented:
+     the entry is cleared here and set to 1 only at the very end, after every
+     test below has passed.  */
+  loop_can_insert_bct[loop_num] = 0;
+
+  /* debugging: do we wish to instrument this loop? */
+  if ( !check_bct_param () )
+    return;
+
+  /* give up if the optimization is suppressed (flag_branch_on_count_reg
+     is zero).  */
+  if ( !flag_branch_on_count_reg )
+    return;
+
+  /* make sure that count-reg is not in use.  The entry tested here is set
+     at the end of this function, for the accepted loop and for all of its
+     enclosing loops.  */
+  if (loop_used_count_register[loop_num]){
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+	      "analyze_loop_iterations %d: BCT instrumentation failed: count register already in use\n",
+	      loop_num);
+    return;
+  }
+
+  /* make sure that the function has no indirect jumps.  */
+  if (indirect_jump_in_function){
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+              "analyze_loop_iterations %d: BCT instrumentation failed: indirect jump in function\n",
+	      loop_num);
+    return;
+  }
+
+  /* make sure that the last loop insn is a conditional jump */
+  last_loop_insn = PREV_INSN (loop_end);
+  if (!is_conditional_branch (last_loop_insn)) {
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+              "analyze_loop_iterations %d: BCT instrumentation failed: invalid jump at loop end\n",
+	      loop_num);
+    return;
+  }
+
+  /* First find the iteration variable.  If the last insn is a conditional
+     branch, and the insn preceding it tests a register value, make that
+     register the iteration variable.  */
+
+  /* We used to use prev_nonnote_insn here, but that fails because it might
+     accidentally get the branch for a contained loop if the branch for this
+     loop was deleted.  We can only trust branches immediately before the
+     loop_end.  */
+
+  comparison = get_condition_for_loop (last_loop_insn);
+  /* ??? Get_condition may switch position of induction variable and
+     invariant register when it canonicalizes the comparison.
+     Operand 0 of the returned comparison is taken below as the iteration
+     variable and operand 1 as the value it is compared against.  */
+
+  if (comparison == 0) {
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+	      "analyze_loop_iterations %d: BCT instrumentation failed: comparison not found\n",
+	      loop_num);
+    return;
+  }
+
+  comparison_code = GET_CODE (comparison);
+  iteration_var = XEXP (comparison, 0);
+  comparison_value = XEXP (comparison, 1);
+
+  original_mode = GET_MODE (iteration_var);  /* Only word-sized integer
+     iteration variables are accepted.  Note that the dump message below
+     is also emitted for an integer mode whose size is not UNITS_PER_WORD.  */
+  if (GET_MODE_CLASS (original_mode) != MODE_INT
+      || GET_MODE_SIZE (original_mode) != UNITS_PER_WORD) {
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+	      "analyze_loop_iterations %d: BCT Instrumentation failed: loop variable not integer\n",
+	      loop_num);
+    return;
+  }
+
+  /* get info about loop bounds and increment.  The results come back
+     through &initial_value and &increment; the test that follows treats a
+     zero in either one as "not found".  */
+  iteration_info (iteration_var, &initial_value, &increment,
+		  loop_start, loop_end);
+
+  /* make sure that all required loop data were found, and that the
+     comparison value and the increment are loop invariant.  The
+     indirect_jump_in_function term repeats the test already made near the
+     top of this function.  */
+  if (!(initial_value && increment && comparison_value
+	&& invariant_p (comparison_value) && invariant_p (increment)
+	&& ! indirect_jump_in_function))
+    {
+      if (loop_dump_stream) {
+	fprintf (loop_dump_stream,
+                "analyze_loop_iterations %d: BCT instrumentation failed because of wrong loop: ", loop_num);
+	if (!(initial_value && increment && comparison_value)) {
+	  fprintf (loop_dump_stream, "\tbounds not available: ");
+	  if ( ! initial_value )
+	    fprintf (loop_dump_stream, "initial ");
+	  if ( ! increment )
+	    fprintf (loop_dump_stream, "increment ");
+	  if ( ! comparison_value )
+	    fprintf (loop_dump_stream, "comparison ");
+	  fprintf (loop_dump_stream, "\n");
+	}
+	if (!invariant_p (comparison_value) || !invariant_p (increment))
+	  fprintf (loop_dump_stream, "\tloop bounds not invariant\n");
+      }
+      return;
+    }
+
+  /* make sure that the increment is constant */
+  if (GET_CODE (increment) != CONST_INT) {
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+              "analyze_loop_iterations %d: instrumentation failed: not arithmetic loop\n",
+	      loop_num);
+    return;
+  }
+
+  /* make sure that the loop contains neither function call, nor jump on table.
+     (the count register might be altered by the called function, and might
+     be used for a branch on table).  The scan covers the insns from
+     loop_start up to, but not including, loop_end.  */
+  for (insn = loop_start; insn && insn != loop_end; insn = NEXT_INSN (insn)) {
+    if (GET_CODE (insn) == CALL_INSN){
+      if (loop_dump_stream)
+	fprintf (loop_dump_stream,
+                "analyze_loop_iterations %d: BCT instrumentation failed: function call in the loop\n",
+		loop_num);
+      return;
+    }
+
+    if (GET_CODE (insn) == JUMP_INSN
+       && (GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC
+	   || GET_CODE (PATTERN (insn)) == ADDR_VEC)){
+      if (loop_dump_stream)
+	fprintf (loop_dump_stream,
+                "analyze_loop_iterations %d: BCT instrumentation failed: computed branch in the loop\n",
+		loop_num);
+      return;
+    }
+  }
+
+  /* At this point, we are sure that the loop can be instrumented with BCT.
+     Some of the loops, however, will not be instrumented - the final decision
+     is taken by insert_bct () */
+  if (loop_dump_stream)
+    fprintf (loop_dump_stream,
+            "analyze_loop_iterations: loop (luid =%d) can be BCT instrumented.\n",
+	    loop_num);
+
+  /* mark all enclosing loops that they cannot use count register.  The walk
+     starts at loop_num itself and follows loop_outer_loop[] until it
+     reaches -1.  */
+  /* ???: In fact, since insert_bct may decide not to instrument this loop,
+     marking here may prevent instrumenting an enclosing loop that could
+    actually be instrumented.  But since this is rare, it is safer to mark
+    here in case the order of calling  (analyze/insert)_bct would be changed.  */
+  for (i=loop_num; i != -1; i = loop_outer_loop[i])
+    loop_used_count_register[i] = 1;
+
+  /* Set data structures which will be used by the instrumentation phase */
+  loop_start_value[loop_num] = initial_value;
+  loop_comparison_value[loop_num] = comparison_value;
+  loop_increment[loop_num] = increment;
+  loop_comparison_code[loop_num] = comparison_code;
+  loop_can_insert_bct[loop_num] = 1;
+}
+
+
+/* instrument loop for insertion of bct instruction.  We distinguish between
+ loops with compile-time bounds, to those with run-time bounds.  The loop
+ behaviour is analyzed according to the following characteristics/variables:
+ ; Input variables:
+ ;   comparison-value: the value to which the iteration counter is compared.
+ ;   initial-value: iteration-counter initial value.
+ ;   increment: iteration-counter increment.
+ ; Computed variables:
+ ;   increment-direction: the sign of the increment.
+ ;   compare-direction: '1' for LT, LE, '-1' for GT, GE, '0' for NE.
+ ;   range-direction: sign (comparison-value - initial-value)
+ We give up on the following cases:
+ ; loop variable overflow.
+ ; run-time loop bounds with comparison code NE.
+ */
+
+static void
+insert_bct (loop_start, loop_end)
+     rtx loop_start, loop_end;
+{
+  rtx initial_value, comparison_value, increment;
+  enum rtx_code comparison_code;
+
+  int increment_direction, compare_direction;
+  int unsigned_p = 0;
+
+  /* if the loop condition is <= or >=, the number of iteration
+      is 1 more than the range of the bounds of the loop */
+  int add_iteration = 0;
+
+  /* the only machine mode we work with - is the integer of the size that the
+     machine has */
+  enum machine_mode loop_var_mode = SImode;
+
+  int loop_num = loop_number (loop_start, loop_end);
+
+  /* get loop-variables. No need to check that these are valid - already
+     checked in analyze_loop_iterations ().  */
+  comparison_code = loop_comparison_code[loop_num];
+  initial_value = loop_start_value[loop_num];
+  comparison_value = loop_comparison_value[loop_num];
+  increment = loop_increment[loop_num];
+
+  /* check analyze_loop_iterations decision for this loop.  */
+  if (! loop_can_insert_bct[loop_num]){
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+	      "insert_bct: [%d] - was decided not to instrument by analyze_loop_iterations ()\n",
+	      loop_num);
+    return;
+  }
+
+  /* make sure that the loop was not fully unrolled.  */
+  if (loop_unroll_factor[loop_num] == -1){
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream, "insert_bct %d: was completely unrolled\n", loop_num);
+    return;
+  }
+
+  /* make sure that the last loop insn is a conditional jump .
+     This check is repeated from analyze_loop_iterations (),
+     because unrolling might have changed that.  */
+  if (!is_conditional_branch (PREV_INSN (loop_end))){
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+	      "insert_bct: not instrumenting BCT because of invalid branch\n");
+    return;
+  }
+
+  /* fix increment in case loop was unrolled.  */
+  if (loop_unroll_factor[loop_num] > 1)
+    increment = GEN_INT ( INTVAL (increment) * loop_unroll_factor[loop_num] );
+
+  /* determine properties and directions of the loop */
+  increment_direction = (INTVAL (increment) > 0) ? 1:-1;
+  switch ( comparison_code ) {
+  case LEU:
+    unsigned_p = 1;
+    /* fallthrough */
+  case LE:
+    compare_direction = 1;
+    add_iteration = 1;
+    break;
+  case GEU:
+    unsigned_p = 1;
+    /* fallthrough */
+  case GE:
+    compare_direction = -1;
+    add_iteration = 1;
+    break;
+  case EQ:
+    /* in this case we cannot know the number of iterations */
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+              "insert_bct: %d: loop cannot be instrumented: == in condition\n",
+	      loop_num);
+    return;
+  case LTU:
+    unsigned_p = 1;
+    /* fallthrough */
+  case LT:
+    compare_direction = 1;
+    break;
+  case GTU:
+    unsigned_p = 1;
+    /* fallthrough */
+  case GT:
+    compare_direction = -1;
+    break;
+  case NE:
+    compare_direction = 0;
+    break;
+  default:
+    abort ();
+  }
+
+
+  /* make sure that the loop does not end by an overflow */
+  if (compare_direction != increment_direction) {
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+              "insert_bct: %d: loop cannot be instrumented: terminated by overflow\n",
+	      loop_num);
+    return;
+  }
+
+  /* try to instrument the loop.  */
+
+  /* Handle the simpler case, where the bounds are known at compile time.  */
+  if (GET_CODE (initial_value) == CONST_INT && GET_CODE (comparison_value) == CONST_INT)
+    {
+      int n_iterations;
+      int increment_value_abs = INTVAL (increment) * increment_direction;
+
+      /* check the relation between compare-val and initial-val */
+      int difference = INTVAL (comparison_value) - INTVAL (initial_value);
+      int range_direction = (difference > 0) ? 1 : -1;
+
+      /* make sure the loop executes enough iterations to gain from BCT */
+      if (difference > -3 && difference < 3) {
+	if (loop_dump_stream)
+	  fprintf (loop_dump_stream,
+		  "insert_bct: loop %d not BCT instrumented: too small iteration count.\n",
+		  loop_num);
+	return;
+      }
+
+      /* make sure that the loop executes at least once */
+      if ((range_direction ==  1 && compare_direction == -1)
+	  || (range_direction == -1 && compare_direction ==  1))
+	{
+	  if (loop_dump_stream)
+	    fprintf (loop_dump_stream,
+		    "insert_bct: loop %d: does not iterate even once. Not instrumenting.\n",
+		    loop_num);
+	  return;
+	}
+
+      /* make sure that the loop does not end by an overflow (in compile time
+         bounds we must have an additional check for overflow, because here
+         we also support the compare code of 'NE'.  */
+      if (comparison_code == NE
+	  && increment_direction != range_direction) {
+	if (loop_dump_stream)
+	  fprintf (loop_dump_stream,
+		  "insert_bct (compile time bounds): %d: loop not instrumented: terminated by overflow\n",
+		  loop_num);
+	return;
+      }
+
+      /* Determine the number of iterations by:
+	 ;
+         ;                  compare-val - initial-val + (increment -1) + additional-iteration
+         ; num_iterations = -----------------------------------------------------------------
+         ;                                           increment
+	 */
+      difference = (range_direction > 0) ? difference : -difference;
+#if 0
+      fprintf (stderr, "difference is: %d\n", difference); /* @*/
+      fprintf (stderr, "increment_value_abs is: %d\n", increment_value_abs); /* @*/
+      fprintf (stderr, "add_iteration is: %d\n", add_iteration); /* @*/
+      fprintf (stderr, "INTVAL (comparison_value) is: %d\n", INTVAL (comparison_value)); /* @*/
+      fprintf (stderr, "INTVAL (initial_value) is: %d\n", INTVAL (initial_value)); /* @*/
+#endif
+
+      if (increment_value_abs == 0) {
+	fprintf (stderr, "insert_bct: error: increment == 0 !!!\n");
+	abort ();
+      }
+      n_iterations = (difference + increment_value_abs - 1 + add_iteration)
+	/ increment_value_abs;
+
+#if 0
+      fprintf (stderr, "number of iterations is: %d\n", n_iterations); /* @*/
+#endif
+      instrument_loop_bct (loop_start, loop_end, GEN_INT (n_iterations));
+
+      /* Done with this loop.  */
+      return;
+    }
+
+  /* Handle the more complex case, that the bounds are NOT known at compile time.  */
+  /* In this case we generate run_time calculation of the number of iterations */
+
+  /* With runtime bounds, if the compare is of the form '!=' we give up */
+  if (comparison_code == NE) {
+    if (loop_dump_stream)
+      fprintf (loop_dump_stream,
+	      "insert_bct: fail for loop %d: runtime bounds with != comparison\n",
+	      loop_num);
+    return;
+  }
+
+  else {
+    /* We rely on the existence of run-time guard to ensure that the
+       loop executes at least once.  */
+    rtx sequence;
+    rtx iterations_num_reg;
+
+    int increment_value_abs = INTVAL (increment) * increment_direction;
+
+    /* make sure that the increment is a power of two, otherwise (an
+       expensive) divide is needed.  */
+    if ( !is_power_of_2(increment_value_abs) )
+      {
+	if (loop_dump_stream)
+	  fprintf (loop_dump_stream,
+		  "insert_bct: not instrumenting BCT because the increment is not power of 2\n");
+	return;
+      }
+
+    /* compute the number of iterations */
+    start_sequence ();
+    {
+      /* CYGNUS LOCAL: HAIFA bug fix */
+      rtx temp_reg;
+
+      /* Again, the number of iterations is calculated by:
+	 ;
+         ;                  compare-val - initial-val + (increment -1) + additional-iteration
+         ; num_iterations = -----------------------------------------------------------------
+         ;                                           increment
+	 */
+      /* ??? Do we have to call copy_rtx here before passing rtx to
+	 expand_binop?  */
+      if (compare_direction > 0) {
+	/* <, <= :the loop variable is increasing */
+	temp_reg = expand_binop (loop_var_mode, sub_optab, comparison_value,
+				 initial_value, NULL_RTX, 0, OPTAB_LIB_WIDEN);
+      }
+      else {
+	temp_reg = expand_binop (loop_var_mode, sub_optab, initial_value,
+				 comparison_value, NULL_RTX, 0, OPTAB_LIB_WIDEN);
+      }
+
+      if (increment_value_abs - 1 + add_iteration != 0)
+	temp_reg = expand_binop (loop_var_mode, add_optab, temp_reg,
+				 GEN_INT (increment_value_abs - 1 + add_iteration),
+				 NULL_RTX, 0, OPTAB_LIB_WIDEN);
+
+      if (increment_value_abs != 1)
+	{
+	  /* ??? This will generate an expensive divide instruction for
+	     most targets.  The original authors apparently expected this
+	     to be a shift, since they test for power-of-2 divisors above,
+	     but just naively generating a divide instruction will not give 
+	     a shift.  It happens to work for the PowerPC target because
+	     the rs6000.md file has a divide pattern that emits shifts.
+	     It will probably not work for any other target.  */
+	  iterations_num_reg = expand_binop (loop_var_mode, sdiv_optab,
+					     temp_reg,
+					     GEN_INT (increment_value_abs),
+					     NULL_RTX, 0, OPTAB_LIB_WIDEN);
+	}
+      else
+	iterations_num_reg = temp_reg;
+      /* END CYGNUS LOCAL: HAIFA bug fix */
+    }
+    sequence = gen_sequence ();
+    end_sequence ();
+    emit_insn_before (sequence, loop_start);
+    instrument_loop_bct (loop_start, loop_end, iterations_num_reg);
+  }
+}
+
+/* Instrument a loop by inserting a bct (branch-on-count) insn in it.
+   New insns are placed after LOOP_START and before LOOP_END;
+   LOOP_NUM_ITERATIONS is an rtx giving the iteration count to load.
+   This is done in the following way:
+   1. A new register is created and assigned the hard register number of the count
+    register.
+   2. In the head of the loop the new variable is initialized by the value passed in the
+    loop_num_iterations parameter.
+   3. At the end of the loop, comparison of the register with 0 is generated.
+    The created comparison follows the pattern defined for the
+    decrement_and_branch_on_count insn, so this insn will be generated in assembly
+    generation phase.
+   4. The compare&branch on the old variable is deleted. So, if the loop-variable was
+    not used elsewhere, it will be eliminated by data-flow analysis.
+
+   Nothing is done unless HAVE_decrement_and_branch_on_count is defined
+   and evaluates nonzero.  */
+
+static void
+instrument_loop_bct (loop_start, loop_end, loop_num_iterations)
+     rtx loop_start, loop_end;
+     rtx loop_num_iterations;
+{
+  rtx temp_reg1, temp_reg2;
+  rtx start_label;
+
+  rtx sequence;
+  /* The counter is always created in SImode, whatever the mode of the
+     original loop variable.  */
+  enum machine_mode loop_var_mode = SImode;
+
+#ifdef HAVE_decrement_and_branch_on_count
+  if (HAVE_decrement_and_branch_on_count)
+    {
+      if (loop_dump_stream)
+	fprintf (loop_dump_stream, "Loop: Inserting BCT\n");
+
+      /* Eliminate the check on the old variable by deleting the two
+	 insns that precede LOOP_END.
+	 NOTE(review): this assumes those two insns are exactly the old
+	 compare and conditional branch -- confirm the callers guarantee
+	 it.  */
+      delete_insn (PREV_INSN (loop_end));
+      delete_insn (PREV_INSN (loop_end));
+
+      /* insert the label which will delimit the start of the loop */
+      start_label = gen_label_rtx ();
+      emit_label_after (start_label, loop_start);
+
+      /* insert initialization of the count register into the loop header */
+      start_sequence ();
+      temp_reg1 = gen_reg_rtx (loop_var_mode);
+      emit_insn (gen_move_insn (temp_reg1, loop_num_iterations));
+
+      /* this will be count register */
+      temp_reg2 = gen_rtx (REG, loop_var_mode, COUNT_REGISTER_REGNUM);
+      /* we have to move the value to the count register from an GPR
+	 because rtx pointed to by loop_num_iterations could contain
+	 expression which cannot be moved into count register */
+      emit_insn (gen_move_insn (temp_reg2, temp_reg1));
+
+      /* The sequence is emitted after LOOP_START, where START_LABEL was
+	 emitted a moment ago.
+	 NOTE(review): presumably this puts the initialization ahead of
+	 START_LABEL, so that it is not repeated when the bct insn
+	 branches back -- confirm emit_insn_after inserts immediately
+	 after its second argument.  */
+      sequence = gen_sequence ();
+      end_sequence ();
+      emit_insn_after (sequence, loop_start);
+
+      /* insert new comparison on the count register instead of the
+	 old one, generating the needed BCT pattern (that will be
+	 later recognized by assembly generation phase).  */
+      emit_jump_insn_before (gen_decrement_and_branch_on_count (temp_reg2, start_label),
+			     loop_end);
+      /* Record the new jump's reference to START_LABEL.  */
+      LABEL_NUSES (start_label)++;
+    }
+
+#endif /* HAVE_decrement_and_branch_on_count */
+}
+
+/* Return the loop number recorded in uid_loop_num for the insn
+   LOOP_START.  LOOP_END is accepted but not examined.  Aborts if the
+   recorded number is -1.  */
+int
+loop_number (loop_start, loop_end)
+     rtx loop_start, loop_end;
+{
+  /* LOOP_START is assumed to be the insn holding the LOOP_START note,
+     which loop unrolling leaves unchanged.  */
+  int num = uid_loop_num[INSN_UID (loop_start)];
+
+  /* Sanity check - should never happen.  */
+  if (num == -1)
+    abort ();
+
+  return num;
+}
+
+/* Scan the insn chain beginning at START and return 1 if it contains a
+   jump that is not recognized as direct, 0 otherwise.  A JUMP_INSN is
+   recognized as direct when its pattern is a SET whose source is either
+   a LABEL_REF, or an IF_THEN_ELSE whose operand 1 is pc_rtx or a
+   LABEL_REF.  Every other JUMP_INSN counts as an indirect jump.  */
+static int
+indirect_jump_in_function_p (start)
+     rtx start;
+{
+  rtx insn;
+
+  for (insn = start; insn != 0; insn = NEXT_INSN (insn))
+    {
+      rtx body;
+
+      if (GET_CODE (insn) != JUMP_INSN)
+	continue;
+
+      body = PATTERN (insn);
+      if (GET_CODE (body) == SET)
+	{
+	  rtx src = XEXP (body, 1);
+
+	  /* Source is a plain label reference.  */
+	  if (GET_CODE (src) == LABEL_REF)
+	    continue;
+
+	  /* Conditional form: only operand 1 of the IF_THEN_ELSE is
+	     inspected.  */
+	  if (GET_CODE (src) == IF_THEN_ELSE
+	      && (XEXP (src, 1) == pc_rtx
+		  || GET_CODE (XEXP (src, 1)) == LABEL_REF))
+	    continue;
+	}
+
+      /* Anything else is treated as an indirect jump; one is enough to
+	 decide the answer.  */
+      return 1;
+    }
+
+  return 0;
+}
+
+/* Return 1 iff N is a power of 2, i.e. N is positive and has exactly
+   one bit set.  Zero and negative values yield 0.  */
+static int
+is_power_of_2(n)
+     int n;
+{
+  /* N & (N - 1) clears the lowest set bit, so it is zero when at most
+     one bit is set.  The N > 0 test rejects zero, which the bit trick
+     alone would accept, and keeps N - 1 from overflowing for the most
+     negative int.  */
+  return n > 0 && (n & (n-1)) == 0;
+}
+
+/* Return 1 iff INSN is a conditional jump, which here means a JUMP_INSN
+   whose pattern is a SET with an IF_THEN_ELSE as its source; return 0
+   for everything else.  */
+int
+is_conditional_branch (insn)
+     rtx insn;
+{
+  rtx work_code;
+
+  if (GET_CODE (insn) != JUMP_INSN)
+    return 0;
+
+  /* Only a pattern that is a single SET is considered; any other
+     pattern shape is reported as not conditional.  */
+  work_code = PATTERN (insn);
+  if (GET_CODE (work_code) != SET)
+    return 0;
+
+  if (GET_CODE (XEXP (work_code, 1)) != IF_THEN_ELSE)
+    return 0;
+
+  return 1;
+}
+
+/* debugging: fix_bct_param () is called from toplev.c upon detection
+   of the -fbct-***-N options.
+
+   PARAM names the setting: "max" stores into dbg_bct_max and "min" into
+   dbg_bct_min.  VAL is the decimal text of the new value, converted
+   with atoi.  Returns 1 if PARAM was recognized and 0 otherwise, in
+   which case nothing is changed.  */
+int
+fix_bct_param (param, val)
+     char *param, *val;
+{
+  if ( !strcmp (param, "max") )
+    dbg_bct_max = atoi (val);
+  else if ( !strcmp (param, "min") )
+    dbg_bct_min = atoi (val);
+  else
+    return 0;
+
+  /* The function is declared to return int, so every path must return
+     a defined value.  */
+  return 1;
+}
+
+/* Debugging aid: count the calls made so far and return 1 if the loop
+   belonging to the current call should be instrumented, 0 otherwise.
+   The call count must be greater than dbg_bct_min and no greater than
+   dbg_bct_max; a bound equal to -1 is not enforced.  */
+static int
+check_bct_param ()
+{
+  static int calls_so_far = 0;
+  int above_min, within_max;
+
+  calls_so_far++;
+
+  above_min = (calls_so_far > dbg_bct_min || dbg_bct_min == -1);
+  within_max = (calls_so_far <= dbg_bct_max || dbg_bct_max == -1);
+
+  return above_min && within_max;
+}
+#endif	/* HAIFA */
+/* END CYGNUS LOCAL haifa */
--- a/gcc/loop.h
+++ b/gcc/loop.h
@ -181,3 +181,13 @@ rtx final_biv_value PROTO((struct iv_class *, rtx, rtx));
 rtx final_giv_value PROTO((struct induction *, rtx, rtx));
 void emit_unrolled_add PROTO((rtx, rtx, rtx));
 int back_branch_in_range_p PROTO((rtx, rtx, rtx));
+
+#ifdef HAIFA
+/* Variables for interaction between unroll.c and loop.c, for
+   the insertion of branch-on-count instructions.
+   NOTE(review): the three pointers are presumably arrays indexed by
+   loop number -- confirm against their definitions.  */
+extern int *loop_unroll_factor;
+extern rtx *loop_start_value;
+extern int *loop_unroll_iter;
+/* Old-style declaration without a parameter list; the definition in
+   loop.c takes (rtx loop_start, rtx loop_end).  */
+extern int loop_number();
+#endif  /* HAIFA */
+
--- a/gcc/regmove.c
+++ b/gcc/regmove.c
@ -0,0 +1,983 @@
+/* Move registers around to reduce number of move instructions needed.
+   Copyright (C) 1987, 88, 89, 92-5, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of GNU CC.
+
+GNU CC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU CC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU CC; see the file COPYING.  If not, write to
+the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+
+/* This module looks for cases where matching constraints would force
+   an instruction to need a reload, and this reload would be a register
+   to register move.  It then attempts to change the registers used by the
+   instruction to avoid the move instruction.  */
+
+#include "config.h"
+#ifdef __STDC__
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+/* Must precede rtl.h for FFS.  */
+#include <stdio.h>
+
+#include "rtl.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "output.h"
+#include "reload.h"
+#include "regs.h"
+
+static int stable_but_for_p PROTO((rtx, rtx, rtx));
+
+#if defined (HAVE_POST_INCREMENT) || defined (HAVE_POST_DECREMENT) \
+    || defined (HAVE_PRE_INCREMENT) || defined (HAVE_PRE_DECREMENT)
+
+/* INC_INSN is an instruction that adds INCREMENT to REG.
+   Try to fold INC_INSN as a post/pre in/decrement into INSN.
+   Iff INC_INSN_SET is nonzero, inc_insn has a destination different from src.
+   PRE selects the form to try: 0 for a post-in/decrement, 1 for a
+   pre-in/decrement.
+   Return nonzero for success.  */
+static int
+try_auto_increment (insn, inc_insn, inc_insn_set, reg, increment, pre)
+     rtx reg, insn, inc_insn ,inc_insn_set;
+     HOST_WIDE_INT increment;
+     int pre;
+{
+  enum rtx_code inc_code;
+
+  rtx pset = single_set (insn);
+  if (pset)
+    {
+      /* Can't use the size of SET_SRC, we might have something like
+	 (sign_extend:SI (mem:QI ...  */
+      rtx use = find_use_as_address (pset, reg, 0);
+      /* NOTE(review): 0 and (rtx) 1 are presumably the "no use" and
+	 "unsuitable use" results of find_use_as_address -- confirm
+	 against its definition.  */
+      if (use != 0 && use != (rtx) 1)
+	{
+	  int size = GET_MODE_SIZE (GET_MODE (use));
+	  /* Each alternative the target enables first stores its rtx
+	     code in inc_code through the comma operator and then checks
+	     that INCREMENT equals plus or minus the access size.  The
+	     chain stops at the first alternative that matches, so
+	     inc_code holds the matching code when the body is entered.  */
+	  if (0
+#ifdef HAVE_POST_INCREMENT
+	      || (pre == 0 && (inc_code = POST_INC, increment == size))
+#endif
+#ifdef HAVE_PRE_INCREMENT
+	      || (pre == 1 && (inc_code = PRE_INC, increment == size))
+#endif
+#ifdef HAVE_POST_DECREMENT
+	      || (pre == 0 && (inc_code = POST_DEC, increment == -size))
+#endif
+#ifdef HAVE_PRE_DECREMENT
+	      || (pre == 1 && (inc_code = PRE_DEC, increment == -size))
+#endif
+          )
+            {
+              /* When INC_INSN has its own destination, queue a change
+                 that replaces its source by that source's operand 0
+                 (presumably dropping the addition and leaving a plain
+                 copy -- confirm).  */
+              if (inc_insn_set)
+                validate_change
+                  (inc_insn, 
+                   &SET_SRC (inc_insn_set),
+		   XEXP (SET_SRC (inc_insn_set), 0), 1);
+              /* Queue the rewrite of the address inside USE into the
+                 chosen auto-in/decrement of REG.  */
+              validate_change (insn, &XEXP (use, 0),
+                               gen_rtx (inc_code,
+                                        Pmode,
+                                        reg), 1);
+              /* Both changes were queued as a group, so they are
+                 applied or rejected together here.  */
+              if (apply_change_group ())
+                {
+                  /* Record on INSN that it now in/decrements REG.  */
+                  REG_NOTES (insn)
+                    = gen_rtx (EXPR_LIST, REG_INC,
+                               reg, REG_NOTES (insn));
+                  /* Without a separate destination INC_INSN is no
+                     longer needed; turn it into a deleted-insn note.  */
+                  if (! inc_insn_set)
+                    {
+                      PUT_CODE (inc_insn, NOTE);
+                      NOTE_LINE_NUMBER (inc_insn) = NOTE_INSN_DELETED;
+                      NOTE_SOURCE_FILE (inc_insn) = 0;
+                    }
+		  return 1;
+                }
+            }
+        }
+    }
+  return 0;
+}
+#endif  /* HAVE_POST_INCREMENT || HAVE_POST_DECREMENT || HAVE_PRE_INCREMENT || HAVE_PRE_DECREMENT */
+
+void
+regmove_optimize (f, nregs, regmove_dump_file)
+     rtx f;
+     int nregs;
+     FILE *regmove_dump_file;
+{
+#ifdef REGISTER_CONSTRAINTS
+  rtx insn;
+  int matches[MAX_RECOG_OPERANDS][MAX_RECOG_OPERANDS];
+  int modified[MAX_RECOG_OPERANDS];
+  int early_clobber[MAX_RECOG_OPERANDS];
+  int commutative;
+  int pass;
+
+  /* A forward/backward pass.  Replace output operands with input operands.  */
+
+  for (pass = 0; pass < 2; pass++)
+    {
+      if (regmove_dump_file)
+	fprintf (regmove_dump_file, "Starting %s pass...\n",
+		 pass ? "backward" : "forward");
+
+      for (insn = pass ? get_last_insn () : f; insn;
+	   insn = pass ? PREV_INSN (insn) : NEXT_INSN (insn))
+	{
+	  if (GET_RTX_CLASS (GET_CODE (insn)) == 'i')
+	    {
+	      int insn_code_number = recog_memoized (insn);
+	      int operand_number, match_number;
+	      
+	      if (insn_code_number < 0)
+		continue;
+    
+	      insn_extract (insn);
+	      if (! constrain_operands (insn_code_number, 0))
+		continue;
+	      
+	      commutative = -1;
+    
+	      /* Must initialize this before the loop, because the code for
+		 the commutative case may set matches for operands other than
+		 the current one.  */
+	      bzero (matches, sizeof (matches));
+    
+	      for (operand_number = 0;
+		   operand_number < insn_n_operands[insn_code_number];
+		   operand_number++)
+		{
+		  int output_operand = 0;
+		  int matching_operand = operand_number;
+		  char *p, c;
+		  int i = 0;
+    
+		  modified[operand_number] = 0;
+		  early_clobber[operand_number] = 0;
+    
+		  p = insn_operand_constraint[insn_code_number][operand_number];
+
+		  if (*p == '=')
+		    modified[operand_number] = 2;
+		  else if (*p == '+')
+		    modified[operand_number] = 1;
+
+		  for (;*p && i < which_alternative; p++)
+		    if (*p == ',')
+		      i++;
+    
+		  while ((c = *p++) != '\0' && c != ',')
+		    switch (c)
+		      {
+		      case '=':
+			break;
+		      case '+':
+			break;
+		      case '&':
+			early_clobber[operand_number] = 1;
+			break;
+		      case '%':
+			commutative = operand_number;
+			break;
+		      case '0': case '1': case '2': case '3': case '4':
+		      case '5': case '6': case '7': case '8': case '9':
+			c -= '0';
+			matches[operand_number][c] = 1;
+			if (commutative >= 0)
+			  {
+			    if (c == commutative || c == commutative + 1)
+			      {
+				int other = c + (c == commutative ? 1 : -1);
+				matches[operand_number][other] = 1;
+			      }
+			    if (operand_number == commutative
+				|| operand_number == commutative + 1)
+			      {
+				int other = (operand_number
+					     + (operand_number == commutative
+						? 1 : -1));
+				matches[other][c] = 1;
+			      }
+			  }
+			break;
+		      }
+		}
+    
+	      /* Now scan through the operands looking for a source operand
+		 which is supposed to match the destination operand.
+		 Then scan forward for an instruction which uses the dest
+		 operand.
+		 If it dies there, then replace the dest in both operands with
+		 the source operand.  */
+    
+	      for (operand_number = 0;
+		   operand_number < insn_n_operands[insn_code_number];
+		   operand_number++)
+		{
+		  for (match_number = 0;
+		       match_number < insn_n_operands[insn_code_number];
+		       match_number++)
+		    {
+		      rtx set, p, src, dst, src_subreg;
+		      rtx post_inc = 0, post_inc_set = 0, search_end = 0;
+		      rtx src_note, dst_note;
+		      int success = 0;
+		      int num_calls = 0;
+		      enum rtx_code code = NOTE;
+		      HOST_WIDE_INT insn_const, newconst;
+		      rtx overlap = 0; /* need to move insn ? */
+    
+		      /* Nothing to do if the two operands aren't supposed to
+			 match.  */
+		      if (matches[operand_number][match_number] == 0)
+			continue;
+    
+		      src = recog_operand[operand_number];
+		      dst = recog_operand[match_number];
+    
+		      if (GET_CODE (src) != REG
+			  || REGNO (src) < FIRST_PSEUDO_REGISTER)
+			continue;
+    
+		      src_subreg = src;
+		      if (GET_CODE (dst) == SUBREG
+			  && GET_MODE_SIZE (GET_MODE (dst))
+			     >= GET_MODE_SIZE (GET_MODE (SUBREG_REG (dst))))
+			{
+			  src_subreg
+			    = gen_rtx(SUBREG,  GET_MODE (SUBREG_REG (dst)),
+				      src, SUBREG_WORD (dst));
+			  dst = SUBREG_REG (dst);
+			}
+		      if (GET_CODE (dst) != REG
+			  || REGNO (dst) < FIRST_PSEUDO_REGISTER)
+			continue;
+    
+		      /* If the operands already match, then there is nothing
+			 to do.  */
+		      if (operands_match_p (src, dst))
+			continue;
+    
+		      set = single_set (insn);
+		      if (! set)
+			continue;
+    
+		      /* operand_number/src must be a read-only operand, and
+			 match_operand/dst must be a write-only operand.  */
+		      if (modified[match_number] != 2)
+			continue;
+    
+		      if (early_clobber[match_number] == 1)
+			continue;
+    
+		      if (modified[operand_number] != 0)
+			continue;
+    
+		      /* Make sure match_operand is the destination.  */
+		      if (recog_operand[match_number] != SET_DEST (set))
+			continue;
+		  
+		      src_note = find_reg_note (insn, REG_DEAD, src);
+    
+		      if (! src_note)
+			{
+			  /* Look for (set (regX) (op regA constX))
+				      (set (regY) (op regA constY))
+			     and change that to
+				      (set (regA) (op regA constX)).
+				      (set (regY) (op regA constY-constX)).
+			     This works for add and shift operations, if
+			     regA is dead after or set by the second insn.  */
+
+			  code = GET_CODE (SET_SRC (set));
+			  if ((code == PLUS || code == LSHIFTRT
+			       || code == ASHIFT || code == ASHIFTRT)
+			      && XEXP (SET_SRC (set), 0) == src
+			      && (GET_CODE (XEXP (SET_SRC (set), 1))
+				  == CONST_INT))
+			    insn_const = INTVAL (XEXP (SET_SRC (set), 1));
+			  else if (! stable_but_for_p (SET_SRC (set), src, dst))
+			    continue;
+			  else
+			    /* We might find a src_note while scanning.  */
+			    code = NOTE;
+			}
+
+		      if (regmove_dump_file)
+			fprintf (regmove_dump_file,
+				 "Could fix operand %d of insn %d matching operand %d.\n",
+				 operand_number, INSN_UID (insn), match_number);
+    
+		      /* ??? If src is set once, and is set equal to a
+			 constant, then do not use it for this optimization,
+			 as this would make it no longer equivalent to a
+			 constant?  */
+    
+		      /* Scan forward to find the next instruction that
+			 uses the output operand.  If the operand dies here,
+			 then replace it in both instructions with
+			 operand_number.  */
+    
+		      for (p = NEXT_INSN (insn); p; p = NEXT_INSN (p))
+			{
+			  if (GET_CODE (p) == CODE_LABEL
+			      || GET_CODE (p) == JUMP_INSN
+			      || (GET_CODE (p) == NOTE
+				  && ((NOTE_LINE_NUMBER (p)
+				       == NOTE_INSN_LOOP_BEG)
+				      || (NOTE_LINE_NUMBER (p)
+					  == NOTE_INSN_LOOP_END))))
+			    break;
+    
+			  if (GET_RTX_CLASS (GET_CODE (p)) != 'i')
+			    continue;
+    
+			  if (reg_set_p (src, p) || reg_set_p (dst, p)
+			      || (GET_CODE (PATTERN (p)) == USE
+				  && reg_overlap_mentioned_p (src,
+							      XEXP (PATTERN (p),
+							      0))))
+			    break;
+    
+			  /* See if all of DST dies in P.  This test is
+			     slightly more conservative than it needs to be.  */
+			  if ((dst_note
+				= find_regno_note (p, REG_DEAD, REGNO (dst)))
+			      && (GET_MODE (XEXP (dst_note, 0))
+				  == GET_MODE (dst)))
+			    {
+			      if (! src_note)
+				{
+				  rtx q;
+				  rtx set2;
+    
+				  /* If an optimization is done, the value
+				     of SRC while P is executed will be
+				     changed.  Check that this is OK.  */
+				  if (reg_overlap_mentioned_p (src,
+							       PATTERN (p)))
+				    break;
+				  for (q = p; q; q = NEXT_INSN (q))
+				    {
+				      if (GET_CODE (q) == CODE_LABEL
+					  || GET_CODE (q) == JUMP_INSN
+					  || (GET_CODE (q) == NOTE
+					      && ((NOTE_LINE_NUMBER (q)
+						   == NOTE_INSN_LOOP_BEG)
+						  || (NOTE_LINE_NUMBER (q)
+						      == NOTE_INSN_LOOP_END))))
+					{
+					  q = 0;
+					  break;
+					}
+				      if (GET_RTX_CLASS (GET_CODE (q)) != 'i')
+					continue;
+				      if (reg_overlap_mentioned_p (src,
+								   PATTERN (q))
+					  || reg_set_p (src, q))
+					break;
+				    }
+				  if (q)
+				    set2 = single_set (q);
+				  if (! q || ! set2
+				      || GET_CODE (SET_SRC (set2)) != code
+				      || XEXP (SET_SRC (set2), 0) != src
+				      || (GET_CODE (XEXP (SET_SRC (set2), 1))
+					  != CONST_INT)
+				      || (SET_DEST (set2) != src
+					  && !find_reg_note (q, REG_DEAD, src)))
+				    {
+				      /* If this is a PLUS, we can still save
+					 a register by doing
+					 src += insn_const;
+					 P;
+					 src -= insn_const; .
+					 This also gives opportunities for
+					 subsequent optimizations in the
+					 backward pass, so do it there.  */
+				      if (code == PLUS && pass == 1
+#ifdef HAVE_cc0
+					  /* We man not emit an insn directly
+					     after P if the latter sets CC0.  */
+					  && ! sets_cc0_p (PATTERN (p))
+#endif
+					  )
+
+					{
+					  search_end = q;
+					  q = insn;
+					  set2 = set;
+					  newconst = -insn_const;
+					  code = MINUS;
+					}
+				      else
+					break;
+				    }
+				  else
+				    {
+				      newconst
+					= (INTVAL (XEXP (SET_SRC (set2), 1))
+					   - insn_const);
+				      /* Reject out of range shifts.  */
+				      if (code != PLUS
+					  && (newconst < 0
+					      || (newconst
+						  >= GET_MODE_BITSIZE (GET_MODE (SET_SRC (set2))))))
+					break;
+				      if (code == PLUS)
+					{
+					  post_inc = q;
+					  if (SET_DEST (set2) != src)
+					    post_inc_set = set2;
+					}
+				    }
+				  /* We use 1 as last argument to
+				     validate_change so that all changes
+				     are accepted or rejected together by
+				     apply_change_group when it is called
+				     by validate_replace_rtx .  */
+				  validate_change (q, &XEXP (SET_SRC (set2), 1),
+						   GEN_INT (newconst), 1);
+				}
+			      validate_change (insn,
+					       recog_operand_loc[match_number],
+					       src, 1);
+			      if (validate_replace_rtx (dst, src_subreg, p))
+				success = 1;
+			      break;
+			    }
+    
+			  if (reg_overlap_mentioned_p (dst, PATTERN (p)))
+			    break;
+			  if (! src_note
+			      && reg_overlap_mentioned_p (src, PATTERN (p)))
+			    {
+			      /* INSN was already checked to be movable when
+				 we found no REG_DEAD note for src on it.  */
+			      overlap = p;
+			      src_note = find_reg_note (p, REG_DEAD, src);
+			    }
+    
+			  /* If we have passed a call instruction, and the
+			     pseudo-reg SRC is not already live across a call,
+			     then don't perform the optimization.  */
+			  if (GET_CODE (p) == CALL_INSN)
+			    {
+			      num_calls++;
+    
+			      if (REG_N_CALLS_CROSSED (REGNO (src)) == 0)
+				break;
+			    }
+			}
+    
+		      if (success)
+			{
+			  /* Remove the death note for DST from P.  */
+			  remove_note (p, dst_note);
+			  if (code == MINUS)
+			    {
+			      post_inc
+				= emit_insn_after (copy_rtx (PATTERN (insn)),
+						   p);
+#if defined (HAVE_PRE_INCREMENT) || defined (HAVE_PRE_DECREMENT)
+			      if (search_end
+				  && try_auto_increment (search_end, post_inc,
+							 0, src, newconst, 1))
+				post_inc = 0;
+#endif
+			      validate_change (insn, &XEXP (SET_SRC (set), 1),
+					       GEN_INT (insn_const), 0);
+			      REG_N_SETS (REGNO (src))++;
+			    }
+			  if (overlap)
+			    {
+			      /* The lifetime of src and dest overlap,
+				 but we can change this by moving insn.  */
+			      rtx pat = PATTERN (insn);
+			      if (src_note)
+				remove_note (overlap, src_note);
+#if defined (HAVE_POST_INCREMENT) || defined (HAVE_POST_DECREMENT)
+			      if (code == PLUS
+				  && try_auto_increment (overlap, insn, 0,
+							 src, insn_const, 0))
+				insn = overlap;
+			      else
+#endif
+				{
+				  emit_insn_after_with_line_notes
+				    (pat, PREV_INSN (p), insn);
+				  PUT_CODE (insn, NOTE);
+				  NOTE_LINE_NUMBER (insn) = NOTE_INSN_DELETED;
+				  NOTE_SOURCE_FILE (insn) = 0;
+				  /* emit_insn_after_with_line_notes
+				     has no return value, so search
+				     for the new insn.  */
+				  for (insn = p; PATTERN (insn) != pat; )
+				    insn = PREV_INSN (insn);
+				}
+			    }
+			  /* Sometimes we'd generate src = const; src += n;
+			     if so, replace the instruction that set src
+			     in the first place.  */
+			
+			  if (! overlap && (code == PLUS || code == MINUS))
+			    {
+			      rtx note
+				= find_reg_note (insn, REG_EQUAL, NULL_RTX);
+			      rtx q, set2;
+			      int num_calls2 = 0;
+
+			      if (note && CONSTANT_P (XEXP (note, 0)))
+				{
+				  for (q = PREV_INSN (insn); q;
+				       q = PREV_INSN(q))
+				    {
+				      if (GET_CODE (q) == JUMP_INSN)
+					{
+					  q = 0;
+					  break;
+					}
+				      if (GET_RTX_CLASS (GET_CODE (q)) != 'i')
+					continue;
+				      if (reg_set_p (src, q))
+					{
+					  set2 = single_set (q);
+					  break;
+					}
+				      if (reg_overlap_mentioned_p (src,
+					  PATTERN (q)))
+					{
+					  q = 0;
+					  break;
+					}
+				      if (GET_CODE (p) == CALL_INSN)
+					num_calls2++;
+				    }
+				  if (q && set2 && SET_DEST (set2) == src
+				      && CONSTANT_P (SET_SRC (set2))
+				      && validate_change (insn, &SET_SRC (set),
+							  XEXP (note, 0), 0))
+				    {
+				      PUT_CODE (q, NOTE);
+				      NOTE_LINE_NUMBER (q) = NOTE_INSN_DELETED;
+				      NOTE_SOURCE_FILE (q) = 0;
+				      REG_N_SETS (REGNO (src))--;
+				      REG_N_CALLS_CROSSED (REGNO (src))
+					-= num_calls2;
+				      insn_const = 0;
+				    }
+				}
+			    }
+			  if (0) ;
+#if defined (HAVE_PRE_INCREMENT) || defined (HAVE_PRE_DECREMENT)
+			  else if ((code == PLUS || code == MINUS)
+				   && insn_const
+				   && try_auto_increment (p, insn, 0,
+							  src, insn_const, 1))
+			    insn = p;
+#endif
+#if defined (HAVE_POST_INCREMENT) || defined (HAVE_POST_DECREMENT)
+			  else if (post_inc
+				   && try_auto_increment (p, post_inc,
+							  post_inc_set, src,
+							  newconst, 0))
+			    post_inc = 0;
+#endif
+#if defined (HAVE_PRE_INCREMENT) || defined (HAVE_PRE_DECREMENT)
+			  /* If post_inc still prevails, try to find an
+			     insn where it can be used as a pre-in/decrement.
+			     If code is MINUS, this was already tried.  */
+			  if (post_inc && code == PLUS
+			  /* Check that newconst is likely to be usable
+			     in a pre-in/decrement before starting the
+			     search.  */
+			      && (0
+#if defined (HAVE_PRE_INCREMENT)
+				  || (newconst > 0 && newconst <= MOVE_MAX)
+#endif
+#if defined (HAVE_PRE_DECREMENT)
+				  || (newconst < 0 && newconst >= -MOVE_MAX)
+#endif
+				 ) && exact_log2 (newconst))
+			    {
+			      rtx q, inc_dest;
+
+			      inc_dest
+				= post_inc_set ? SET_DEST (post_inc_set) : src;
+			      for (q = post_inc; q = NEXT_INSN (q); )
+				{
+				  if (GET_CODE (q) == CODE_LABEL
+				      || GET_CODE (q) == JUMP_INSN
+				      || (GET_CODE (q) == NOTE
+					  && ((NOTE_LINE_NUMBER (q)
+					       == NOTE_INSN_LOOP_BEG)
+					      || (NOTE_LINE_NUMBER (q)
+						  == NOTE_INSN_LOOP_END))))
+				    break;
+				  if (GET_RTX_CLASS (GET_CODE (q)) != 'i')
+				    continue;
+				  if (src != inc_dest
+				      && (reg_overlap_mentioned_p (src,
+								   PATTERN (q))
+					  || reg_set_p (src, q)))
+				    break;
+				  if (reg_set_p (inc_dest, q))
+				    break;
+				  if (reg_overlap_mentioned_p (inc_dest,
+							       PATTERN (q)))
+				    {
+				      try_auto_increment (q, post_inc,
+							  post_inc_set,
+							  inc_dest,
+							  newconst, 1);
+				      break;
+				    }
+				}
+			    }
+#endif /* defined (HAVE_PRE_INCREMENT) || defined (HAVE_PRE_DECREMENT) */
+			  /* Move the death note for DST to INSN if it is used
+			     there.  */
+			  if (reg_overlap_mentioned_p (dst, PATTERN (insn)))
+			    {
+			      XEXP (dst_note, 1) = REG_NOTES (insn);
+			      REG_NOTES (insn) = dst_note;
+			    }
+    
+			  if (src_note)
+			    {
+			      /* Move the death note for SRC from INSN to P.  */
+			      if (! overlap)
+				remove_note (insn, src_note);
+			      XEXP (src_note, 1) = REG_NOTES (p);
+			      REG_NOTES (p) = src_note;
+    
+			      REG_N_CALLS_CROSSED (REGNO (src)) += num_calls;
+			    }
+    
+			  REG_N_SETS (REGNO (src))++;
+			  REG_N_SETS (REGNO (dst))--;
+    
+			  REG_N_CALLS_CROSSED (REGNO (dst)) -= num_calls;
+    
+			  /* ??? Must adjust reg_live_length, and reg_n_refs for
+			     both registers.  Must keep track of loop_depth in
+			     order to get reg_n_refs adjustment correct.  */
+    
+			  if (regmove_dump_file)
+			    fprintf (regmove_dump_file,
+				     "Fixed operand %d of insn %d matching operand %d.\n",
+				     operand_number, INSN_UID (insn),
+				     match_number);
+    
+			  goto done_forwards;
+			}
+		    }
+		}
+	    done_forwards:
+	      ;
+	    }
+	}
+    }
+
+  /* A backward pass.  Replace input operands with output operands.  */
+
+  if (regmove_dump_file)
+    fprintf (regmove_dump_file, "Starting backward pass...\n");
+
+  for (insn = get_last_insn (); insn; insn = PREV_INSN (insn))
+    {
+      if (GET_RTX_CLASS (GET_CODE (insn)) == 'i')
+	{
+	  int insn_code_number = recog_memoized (insn);
+	  int operand_number, match_number;
+	  
+	  if (insn_code_number < 0)
+	    continue;
+
+	  insn_extract (insn);
+	  if (! constrain_operands (insn_code_number, 0))
+	    continue;
+	  
+	  commutative = -1;
+
+	  /* Must initialize this before the loop, because the code for
+	     the commutative case may set matches for operands other than
+	     the current one.  */
+	  bzero (matches, sizeof (matches));
+
+	  for (operand_number = 0;
+	       operand_number < insn_n_operands[insn_code_number];
+	       operand_number++)
+	    {
+	      int output_operand = 0;
+	      int matching_operand = operand_number;
+	      char *p, c;
+	      int i = 0;
+
+	      modified[operand_number] = 0;
+	      early_clobber[operand_number] = 0;
+
+	      p = insn_operand_constraint[insn_code_number][operand_number];
+
+	      if (*p == '=')
+		modified[operand_number] = 2;
+	      else if (*p == '+')
+		modified[operand_number] = 1;
+
+	      for (; *p && i < which_alternative; p++)
+		if (*p == ',')
+		  i++;
+
+	      while ((c = *p++) != '\0' && c != ',')
+		switch (c)
+		  {
+		  case '=':
+		    break;
+		  case '+':
+		    break;
+		  case '&':
+		    early_clobber[operand_number] = 1;
+		    break;
+		  case '%':
+		    commutative = operand_number;
+		    break;
+		  case '0': case '1': case '2': case '3': case '4':
+		  case '5': case '6': case '7': case '8': case '9':
+		    c -= '0';
+		    matches[c][operand_number] = 1;
+		    if (commutative >= 0)
+		      {
+			if (c == commutative || c == commutative + 1)
+			  {
+			    int other = c + (c == commutative ? 1 : -1);
+			    matches[other][operand_number] = 1;
+			  }
+			if (operand_number == commutative
+			    || operand_number == commutative + 1)
+			  {
+			    int other = (operand_number
+					 + (operand_number == commutative
+					    ? 1 : -1));
+			    matches[c][other] = 1;
+			  }
+		      }
+		    break;
+		  }
+	    }
+
+	  /* Now scan through the operands looking for a destination operand
+	     which is supposed to match a source operand.
+	     Then scan backward for an instruction which sets the source
+	     operand.  If safe, then replace the source operand with the
+	     dest operand in both instructions.  */
+
+	  for (operand_number = 0;
+	       operand_number < insn_n_operands[insn_code_number];
+	       operand_number++)
+	    {
+	      for (match_number = 0;
+		   match_number < insn_n_operands[insn_code_number];
+		   match_number++)
+		{
+		  rtx set, p, src, dst;
+		  rtx src_note, dst_note;
+		  int success = 0;
+		  int num_calls = 0;
+
+		  /* Nothing to do if the two operands aren't supposed to
+		     match.  */
+		  if (matches[operand_number][match_number] == 0)
+		    continue;
+
+		  dst = recog_operand[operand_number];
+		  src = recog_operand[match_number];
+
+		  if (GET_CODE (src) != REG
+		      || REGNO (src) < FIRST_PSEUDO_REGISTER)
+		    continue;
+
+		  if (GET_CODE (dst) != REG
+		      || REGNO (dst) < FIRST_PSEUDO_REGISTER)
+		    continue;
+
+		  /* If the operands already match, then there is nothing
+		     to do.  */
+		  if (operands_match_p (src, dst))
+		    continue;
+
+		  set = single_set (insn);
+		  if (! set)
+		    continue;
+
+		  /* operand_number/dst must be a write-only operand, and
+		     match_number/src must be a read-only operand.  */
+		  if (modified[match_number] != 0)
+		    continue;
+
+		  if (early_clobber[operand_number] == 1)
+		    continue;
+
+		  if (modified[operand_number] != 2)
+		    continue;
+
+		  /* Make sure operand_number is the destination.  */
+		  if (recog_operand[operand_number] != SET_DEST (set))
+		    continue;
+	      
+		  if (! (src_note = find_reg_note (insn, REG_DEAD, src)))
+		    continue;
+
+		  /* Can not modify an earlier insn to set dst if this insn
+		     uses an old value in the source.  */
+		  if (reg_overlap_mentioned_p (dst, SET_SRC (set)))
+		    continue;
+
+		  if (regmove_dump_file)
+		    fprintf (regmove_dump_file,
+			     "Could fix operand %d of insn %d matching operand %d.\n",
+			     operand_number, INSN_UID (insn), match_number);
+
+		  /* ??? If src is set once, and is set equal to a constant,
+		     then do not use it for this optimization, as this would
+		     make it no longer equivalent to a constant?  */
+
+		  /* Scan backward to find the first instruction that uses
+		     the input operand.  If the operand is set here, then
+		     replace it in both instructions with operand_number.  */
+
+		  for (p = PREV_INSN (insn); p; p = PREV_INSN (p))
+		    {
+		      rtx pset;
+
+		      if (GET_CODE (p) == CODE_LABEL
+			  || GET_CODE (p) == JUMP_INSN
+			  || (GET_CODE (p) == NOTE
+			      && (NOTE_LINE_NUMBER (p) == NOTE_INSN_LOOP_BEG
+				  || NOTE_LINE_NUMBER (p) == NOTE_INSN_LOOP_END)))
+			break;
+
+		      if (GET_RTX_CLASS (GET_CODE (p)) != 'i')
+			continue;
+
+		      /* ??? See if all of SRC is set in P.  This test is much
+			 more conservative than it needs to be.  */
+		      pset = single_set (p);
+		      if (pset && SET_DEST (pset) == src)
+			{
+			  /* We use validate_replace_rtx, in case there
+			     are multiple identical source operands.  All of
+			     them have to be changed at the same time.  */
+			  if (validate_replace_rtx (src, dst, insn))
+			    {
+			      if (validate_change (p, &SET_DEST (pset),
+						   dst, 0))
+				success = 1;
+			      else
+				{
+				  /* Change all source operands back.
+				     This modifies the dst as a side-effect.  */
+				  validate_replace_rtx (dst, src, insn);
+				  /* Now make sure the dst is right.  */
+				  validate_change (insn,
+						   recog_operand_loc[operand_number],
+						   dst, 0);
+				}
+			    }
+			  break;
+			}
+
+		      if (reg_overlap_mentioned_p (src, PATTERN (p))
+			  || reg_overlap_mentioned_p (dst, PATTERN (p)))
+			break;
+
+		      /* If we have passed a call instruction, and the
+			 pseudo-reg DST is not already live across a call,
+			 then don't perform the optimization.  */
+		      if (GET_CODE (p) == CALL_INSN)
+			{
+			  num_calls++;
+
+			  if (REG_N_CALLS_CROSSED (REGNO (dst)) == 0)
+			    break;
+			}
+		    }
+
+		  if (success)
+		    {
+		      /* Remove the death note for SRC from INSN.  */
+		      remove_note (insn, src_note);
+		      /* Move the death note for SRC to P if it is used
+			 there.  */
+		      if (reg_overlap_mentioned_p (src, PATTERN (p)))
+			{
+			  XEXP (src_note, 1) = REG_NOTES (p);
+			  REG_NOTES (p) = src_note;
+			}
+		      /* If there is a REG_DEAD note for DST on P, then remove
+			 it, because DST is now set there.  */
+		      if (dst_note = find_reg_note (p, REG_DEAD, dst))
+			remove_note (p, dst_note);
+
+		      REG_N_SETS (REGNO (dst))++;
+		      REG_N_SETS (REGNO (src))--;
+
+		      REG_N_CALLS_CROSSED (REGNO (dst)) += num_calls;
+		      REG_N_CALLS_CROSSED (REGNO (src)) -= num_calls;
+
+		      /* ??? Must adjust reg_live_length, and reg_n_refs for
+			 both registers.  Must keep track of loop_depth in
+			 order to get reg_n_refs adjustment correct.  */
+
+		      if (regmove_dump_file)
+			fprintf (regmove_dump_file,
+				 "Fixed operand %d of insn %d matching operand %d.\n",
+				 operand_number, INSN_UID (insn), match_number);
+
+		      goto done_backwards;
+		    }
+		}
+	    }
+	done_backwards:
+	  ;
+	}
+    }
+#endif /* REGISTER_CONSTRAINTS */
+}
+
+/* Return nonzero if X is stable except for mentioning SRC or mentioning /
+   changing DST.  If in doubt, presume it is unstable.  */
+static int
+stable_but_for_p (x, src, dst)
+     rtx x, src, dst;
+{
+  RTX_CODE code = GET_CODE (x);
+  switch (GET_RTX_CLASS (code))
+    {
+    case '<': case '1': case 'c': case '2': case 'b': case '3':
+      {	/* Stable only if every 'e' operand is; others are not examined.  */
+	int i;
+	char *fmt = GET_RTX_FORMAT (code);
+	for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+	  if (fmt[i] == 'e' && ! stable_but_for_p (XEXP (x, i), src, dst))
+	      return 0;
+	return 1;
+      }
+    case 'o':
+      if (x == src || x == dst)	/* Pointer identity with SRC or DST.  */
+	return 1;
+      /* Fall through: other objects are judged by rtx_unstable_p.  */
+    default:
+      return ! rtx_unstable_p (x);
+    }
+}
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@ -812,6 +812,7 @@ extern void remove_note		PROTO((rtx, rtx));
 extern void note_stores		PROTO((rtx, void (*)()));
 extern int refers_to_regno_p	PROTO((int, int, rtx, rtx *));
 extern int reg_overlap_mentioned_p PROTO((rtx, rtx));
+extern rtx find_use_as_address	PROTO((rtx, rtx, HOST_WIDE_INT));


 /* Maximum number of parallel sets and clobbers in any insn in this fn.
--- a/gcc/toplev.c
+++ b/gcc/toplev.c
@ -245,6 +245,7 @@ int cse2_dump = 0;
 int branch_prob_dump = 0;
 int flow_dump = 0;
 int combine_dump = 0;
+int regmove_dump = 0;
 int sched_dump = 0;
 int local_reg_dump = 0;
 int global_reg_dump = 0;
@ -566,6 +567,35 @@ int flag_pedantic_errors = 0;
 int flag_schedule_insns = 0;
 int flag_schedule_insns_after_reload = 0;

+#ifdef HAIFA
+/* The following flags have effect only for scheduling before register
+   allocation:
+
+   flag_schedule_interblock means schedule insns across basic blocks.
+   flag_schedule_speculative means allow speculative motion of non-load insns.
+   flag_schedule_speculative_load means allow speculative motion of some
+   load insns.
+   flag_schedule_speculative_load_dangerous allows speculative motion of more
+   load insns.
+   flag_schedule_reverse_before_reload means try to reverse original order
+   of insns (S).
+   flag_schedule_reverse_after_reload means try to reverse original order
+   of insns (R).  */
+
+int flag_schedule_interblock = 1;
+int flag_schedule_speculative = 1;
+int flag_schedule_speculative_load = 0;
+int flag_schedule_speculative_load_dangerous = 0;
+int flag_schedule_reverse_before_reload = 0;
+int flag_schedule_reverse_after_reload = 0;
+
+
+/* flag_branch_on_count_reg means try to replace add-1,compare,branch tuple
+   by a cheaper branch, on a count register. */
+int flag_branch_on_count_reg;
+#endif  /* HAIFA */
+
+
 /* -finhibit-size-directive inhibits output of .size for ELF.
   This is used only for compiling crtstuff.c, 
   and it may be extended to other effects
@ -616,6 +646,8 @@ int flag_check_memory_usage = 0;
   -fcheck-memory-usage.  */
 int flag_prefix_function_name = 0;

+int flag_regmove = 0;
+
 /* 1 if alias checking is on (by default, when -O).  */
 int flag_alias_check = 0;

@ -666,6 +698,15 @@ struct { char *string; int *variable; int on_value;} f_options[] =
  {"pretend-float", &flag_pretend_float, 1},
  {"schedule-insns", &flag_schedule_insns, 1},
  {"schedule-insns2", &flag_schedule_insns_after_reload, 1},
+#ifdef HAIFA
+  {"sched-interblock",&flag_schedule_interblock, 1},
+  {"sched-spec",&flag_schedule_speculative, 1},
+  {"sched-spec-load",&flag_schedule_speculative_load, 1},
+  {"sched-spec-load-dangerous",&flag_schedule_speculative_load_dangerous, 1},
+  {"sched-reverse-S",&flag_schedule_reverse_before_reload, 1},
+  {"sched-reverse-R",&flag_schedule_reverse_after_reload, 1},
+  {"branch-count-reg",&flag_branch_on_count_reg, 1},
+#endif  /* HAIFA */
  {"pic", &flag_pic, 1},
  {"PIC", &flag_pic, 2},
  {"exceptions", &flag_exceptions, 1},
@ -680,6 +721,7 @@ struct { char *string; int *variable; int on_value;} f_options[] =
  {"function-sections", &flag_function_sections, 1},
  {"verbose-asm", &flag_verbose_asm, 1},
  {"gnu-linker", &flag_gnu_linker, 1},
+  {"regmove", &flag_regmove, 1},
  {"pack-struct", &flag_pack_struct, 1},
  {"stack-check", &flag_stack_check, 1},
  {"bytecode", &output_bytecode, 1},
@ -885,6 +927,7 @@ FILE *cse2_dump_file;
 FILE *branch_prob_dump_file;
 FILE *flow_dump_file;
 FILE *combine_dump_file;
+FILE *regmove_dump_file;
 FILE *sched_dump_file;
 FILE *local_reg_dump_file;
 FILE *global_reg_dump_file;
@ -905,6 +948,7 @@ int cse2_time;
 int branch_prob_time;
 int flow_time;
 int combine_time;
+int regmove_time;
 int sched_time;
 int local_alloc_time;
 int global_alloc_time;
@ -1053,6 +1097,8 @@ fatal_insn (message, insn)
    fflush (flow_dump_file);
  if (combine_dump_file)
    fflush (combine_dump_file);
+  if (regmove_dump_file)
+    fflush (regmove_dump_file);
  if (sched_dump_file)
    fflush (sched_dump_file);
  if (local_reg_dump_file)
@ -2131,6 +2177,7 @@ compile_file (name)
  branch_prob_time = 0;
  flow_time = 0;
  combine_time = 0;
+  regmove_time = 0;
  sched_time = 0;
  local_alloc_time = 0;
  global_alloc_time = 0;
@ -2229,6 +2276,10 @@ compile_file (name)
  if (combine_dump)
    combine_dump_file = open_dump_file (dump_base_name, ".combine");

+  /* If regmove dump desired, open the output file.  */
+  if (regmove_dump)
+    regmove_dump_file = open_dump_file (dump_base_name, ".regmove");
+
  /* If scheduling dump desired, open the output file.  */
  if (sched_dump)
    sched_dump_file = open_dump_file (dump_base_name, ".sched");
@ -2713,6 +2764,9 @@ compile_file (name)
      fclose (combine_dump_file);
    }

+  if (regmove_dump)
+    fclose (regmove_dump_file);
+
  if (sched_dump)
    fclose (sched_dump_file);

@ -2765,6 +2819,7 @@ compile_file (name)
 	  print_time ("branch-prob", branch_prob_time);
 	  print_time ("flow", flow_time);
 	  print_time ("combine", combine_time);
+	  print_time ("regmove", regmove_time);
 	  print_time ("sched", sched_time);
 	  print_time ("local-alloc", local_alloc_time);
 	  print_time ("global-alloc", global_alloc_time);
@ -3304,6 +3359,26 @@ rest_of_compilation (decl)
 	       fflush (combine_dump_file);
 	     });

+  if (regmove_dump)
+    TIMEVAR (dump_time,
+	     {
+	       fprintf (regmove_dump_file, "\n;; Function %s\n\n",
+			(*decl_printable_name) (decl, 2));
+	     });
+
+  /* Register allocation pre-pass, to reduce number of moves
+     necessary for two-address machines.  */
+  if (optimize > 0 && flag_regmove)
+    TIMEVAR (regmove_time, regmove_optimize (insns, max_reg_num (),
+					     regmove_dump_file));
+
+  if (regmove_dump)
+    TIMEVAR (dump_time,
+	     {
+	       print_rtl (regmove_dump_file, insns);
+	       fflush (regmove_dump_file);
+	     });
+
  /* Print function header into sched dump now
     because doing the sched analysis makes some of the dump.  */

@ -3703,6 +3778,7 @@ main (argc, argv, envp)
      flag_schedule_insns = 1;
      flag_schedule_insns_after_reload = 1;
 #endif
+      flag_regmove = 1;
    }

  if (optimize >= 3)
@ -3764,6 +3840,7 @@ main (argc, argv, envp)
 		    jump2_opt_dump = 1;
 		    local_reg_dump = 1;
 		    loop_dump = 1;
+		    regmove_dump = 1;
 		    rtl_dump = 1;
 		    cse_dump = 1, cse2_dump = 1;
 		    sched_dump = 1;
@ -3815,6 +3892,9 @@ main (argc, argv, envp)
 		  case 't':
 		    cse2_dump = 1;
 		    break;
+		  case 'N':
+		    regmove_dump = 1;
+		    break;
 		  case 'S':
 		    sched_dump = 1;
 		    break;
@ -3862,6 +3942,18 @@ main (argc, argv, envp)

 	      if (found)
 		;
+#ifdef HAIFA
+#ifdef INSN_SCHEDULING
+	      else if (!strncmp (p, "sched-verbose-",14))
+		fix_sched_param("verbose",&p[14]);
+	      else if (!strncmp (p, "sched-max-",10))
+		fix_sched_param("max",&p[10]);
+	      else if (!strncmp (p, "sched-inter-max-b-",18))
+		fix_sched_param("interblock-max-blocks",&p[18]);
+	      else if (!strncmp (p, "sched-inter-max-i-",18))
+		fix_sched_param("interblock-max-insns",&p[18]);
+#endif
+#endif  /* HAIFA */
 	      else if (!strncmp (p, "fixed-", 6))
 		fix_register (&p[6], 1, 1);
 	      else if (!strncmp (p, "call-used-", 10))
--- a/gcc/unroll.c
+++ b/gcc/unroll.c
@ -202,7 +202,7 @@ static rtx initial_reg_note_copy PROTO((rtx, struct inline_remap *));
 static void final_reg_note_copy PROTO((rtx, struct inline_remap *));
 static void copy_loop_body PROTO((rtx, rtx, struct inline_remap *, rtx, int,
 				  enum unroll_types, rtx, rtx, rtx, rtx));
-static void iteration_info PROTO((rtx, rtx *, rtx *, rtx, rtx));
+void iteration_info PROTO((rtx, rtx *, rtx *, rtx, rtx));
 static rtx approx_final_value PROTO((enum rtx_code, rtx, int *, int *));
 static int find_splittable_regs PROTO((enum unroll_types, rtx, rtx, rtx, int));
 static int find_splittable_givs PROTO((struct iv_class *,enum unroll_types,
@ -1094,6 +1094,16 @@ unroll_loop (loop_end, insn_count, loop_start, end_insert_before,
 	  /* Set unroll type to MODULO now.  */
 	  unroll_type = UNROLL_MODULO;
 	  loop_preconditioned = 1;
+#ifdef HAIFA
+	  if (loop_n_iterations > 0)
+	    loop_unroll_iter[ loop_number(loop_start, loop_end) ]
+	      = (loop_n_iterations
+		  - loop_n_iterations % (abs_inc * unroll_number));
+	  else
+	    /* inform loop.c about the new initial value */
+	    loop_start_value[loop_number(loop_start, loop_end)] = initial_value;
+#endif
+
 	}
    }

@ -1108,6 +1118,15 @@ unroll_loop (loop_end, insn_count, loop_start, end_insert_before,

  /* At this point, we are guaranteed to unroll the loop.  */

+#ifdef HAIFA
+  /* inform loop.c about the factor of unrolling */
+  if (unroll_type == UNROLL_COMPLETELY)
+    loop_unroll_factor[ loop_number(loop_start, loop_end) ] = -1;
+  else
+    loop_unroll_factor[ loop_number(loop_start, loop_end) ] = unroll_number;
+#endif  /* HAIFA */
+
+
  /* For each biv and giv, determine whether it can be safely split into
     a different variable for each unrolled copy of the loop body.
     We precalculate and save this info here, since computing it is
@ -2263,7 +2282,7 @@ biv_total_increment (bl, loop_start, loop_end)
   Initial_value and/or increment are set to zero if their values could not
   be calculated.  */

-static void
+void
 iteration_info (iteration_var, initial_value, increment, loop_start, loop_end)
     rtx iteration_var, *initial_value, *increment;
     rtx loop_start, loop_end;