predict.c (estimate_probability): Reorganize opcode heuristics.
* predict.c (estimate_probability): Reorganize opcode heuristics.
* predict.def (PRED_OPCODE_POSITIVE, PRED_OPCODE_NONEQUAL,
  PRED_FPOPCODE): New.

* i386.c (override_options): Recognize various CPU variants and set
  SSE/MMX/3dNOW flags accordingly.
* i386.h (MASK_MMX_SET, MASK_SSE_SET, MASK_SSE2_SET, MASK_3DNOW_SET,
  MASK_3DNOW_A_SET): New.
  (MASK_ACCUMULATE_OUTGOING_ARGS_SET): New.
  (MASK_NO_ACCUMULATE_OUTGOING_ARGS): Delete.
  (MASK_*): Renumber.
  (TARGET_FLAGS): Use new masks.
  (CPP_CPU_SPECS): Recognize new CPU variants.
* invoke.texi (-mcpu): Update documentation.

* flags.h (flag_prefetch_loop_arrays): Declare.
* loop.h (LOOP_PREFETCH): Define new constant.
* loop.c (strength_reduce): Call emit_prefetch_instructions.
  (MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
  PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
  constants.
  (check_store_data): New structure.
  (check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
  New functions.
* toplev.c: Include insn-flags.h.
  (flag_prefetch_loop_arrays): New global variable.
  (lang_independent_option): Add -fprefetch-loop-arrays.
  (rest_of_compilation): Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
  is set.
* Makefile.in (toplev.c): Depend on insn-flags.h.
* invoke.texi (-fprefetch-loop-arrays): Document.

* predict.c (estimate_probability): Distribute the loop exit
  probability according to number of exit edges.

* cfgcleanup.c (insns_match_p): Break out from ...;
  (flow_find_cross_jump): ... here;
  (outgoing_edges_match): Add parameter MODE; attempt to match everything
  except for tablejumps.
  (try_crossjump_to_edge): Accept complex edges.
  (try_crossjump_bb): Likewise.

From-SVN: r47969
commit 0dd0e980b5
parent 85230e5255
gcc/ChangeLog
@@ -1,3 +1,47 @@
Thu Dec 13 12:31:07 CET 2001 Jan Hubicka <jh@suse.cz>

        * predict.c (estimate_probability): Reorganize opcode heuristics.
        * predict.def (PRED_OPCODE_POSITIVE, PRED_OPCODE_NONEQUAL,
        PRED_FPOPCODE): New.

        * i386.c (override_options): Recognize various CPU variants and set
        SSE/MMX/3dNOW flags accordingly.
        * i386.h (MASK_MMX_SET, MASK_SSE_SET, MASK_SSE2_SET, MASK_3DNOW_SET,
        MASK_3DNOW_A_SET): New.
        (MASK_ACCUMULATE_OUTGOING_ARGS_SET): New.
        (MASK_NO_ACCUMULATE_OUTGOING_ARGS): Delete.
        (MASK_*): Renumber.
        (TARGET_FLAGS): Use new masks.
        (CPP_CPU_SPECS): Recognize new CPU variants.
        * invoke.texi (-mcpu): Update documentation.

        * flags.h (flag_prefetch_loop_arrays): Declare.
        * loop.h (LOOP_PREFETCH): Define new constant.
        * loop.c (strength_reduce): Call emit_prefetch_instructions.
        (MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
        PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
        constants.
        (check_store_data): New structure.
        (check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
        New functions.
        * toplev.c: Include insn-flags.h.
        (flag_prefetch_loop_arrays): New global variable.
        (lang_independent_option): Add -fprefetch-loop-arrays.
        (rest_of_compilation) Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
        is set.
        * Makefile.in (toplev.c): Depend on insn-flags.h.
        * invoke.texi (-fprefetch-loop-arrays): Document.

        * predict.c (estimate_probability): Distribute the loop exit
        probability according to number of exit edges.

        * cfgcleanup.c (insns_match_p): Break out from ...;
        (flow_find_cross_jump): ... here;
        (outgoing_edges_match): Add parameter MODE; attempt to match everything
        except for tablejumps.
        (try_crossjump_to_edge): Accept complex edges.
        (try_crossjump_bb): Likewise.

2001-11-29 Corey Minyard <minyard@acm.org>

        * recog.c (validate_replace_rtx_1): Use simplify_gen_binary
gcc/Makefile.in
@@ -1321,7 +1321,7 @@ toplev.o : toplev.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(RTL_H) function.h \
   dwarf2out.h sdbout.h dbxout.h $(EXPR_H) hard-reg-set.h $(BASIC_BLOCK_H) \
   graph.h $(LOOP_H) except.h $(REGS_H) $(TIMEVAR_H) $(lang_options_files) \
   ssa.h $(PARAMS_H) $(TM_P_H) reload.h dwarf2asm.h $(TARGET_H) halfpic.h \
   langhooks.h
   langhooks.h insn-flags.h
        $(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
          -DTARGET_NAME=\"$(target_alias)\" \
          -c $(srcdir)/toplev.c $(OUTPUT_OPTION)
gcc/cfgcleanup.c | 284
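For orientation: cross-jumping merges identical instruction sequences that end different predecessor blocks of a common destination.  A minimal C sketch of the kind of source it targets -- purely illustrative, not part of this patch, with hypothetical function names:

    /* Both arms of the "if" end in the same call sequence, so the common
       tail is a candidate for being merged ("cross-jumped") into one copy.  */
    extern void do_a (int), do_b (int), cleanup (int);

    void
    example (int cond, int x)
    {
      if (cond)
        {
          do_a (x);
          cleanup (x);      /* identical tail ...              */
        }
      else
        {
          do_b (x);
          cleanup (x);      /* ... shared after cross-jumping. */
        }
    }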
@ -64,9 +64,11 @@ enum bb_flags {
|
||||
|
||||
static bool try_crossjump_to_edge PARAMS ((int, edge, edge));
|
||||
static bool try_crossjump_bb PARAMS ((int, basic_block));
|
||||
static bool outgoing_edges_match PARAMS ((basic_block, basic_block));
|
||||
static bool outgoing_edges_match PARAMS ((int,
|
||||
basic_block, basic_block));
|
||||
static int flow_find_cross_jump PARAMS ((int, basic_block, basic_block,
|
||||
rtx *, rtx *));
|
||||
static bool insns_match_p PARAMS ((int, rtx, rtx));
|
||||
|
||||
static bool delete_unreachable_blocks PARAMS ((void));
|
||||
static bool label_is_jump_target_p PARAMS ((rtx, rtx));
|
||||
@ -546,6 +548,108 @@ merge_blocks (e, b, c, mode)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* Return true if I1 and I2 are equivalent and thus can be crossjumped. */
|
||||
|
||||
static bool
|
||||
insns_match_p (mode, i1, i2)
|
||||
int mode;
|
||||
rtx i1, i2;
|
||||
{
|
||||
rtx p1, p2;
|
||||
|
||||
/* Verify that I1 and I2 are equivalent. */
|
||||
if (GET_CODE (i1) != GET_CODE (i2))
|
||||
return false;
|
||||
|
||||
p1 = PATTERN (i1);
|
||||
p2 = PATTERN (i2);
|
||||
|
||||
if (GET_CODE (p1) != GET_CODE (p2))
|
||||
return false;
|
||||
|
||||
/* If this is a CALL_INSN, compare register usage information.
|
||||
If we don't check this on stack register machines, the two
|
||||
CALL_INSNs might be merged leaving reg-stack.c with mismatching
|
||||
numbers of stack registers in the same basic block.
|
||||
If we don't check this on machines with delay slots, a delay slot may
|
||||
be filled that clobbers a parameter expected by the subroutine.
|
||||
|
||||
??? We take the simple route for now and assume that if they're
|
||||
equal, they were constructed identically. */
|
||||
|
||||
if (GET_CODE (i1) == CALL_INSN
|
||||
&& !rtx_equal_p (CALL_INSN_FUNCTION_USAGE (i1),
|
||||
CALL_INSN_FUNCTION_USAGE (i2)))
|
||||
return false;
|
||||
|
||||
#ifdef STACK_REGS
|
||||
/* If cross_jump_death_matters is not 0, the insn's mode
|
||||
indicates whether or not the insn contains any stack-like
|
||||
regs. */
|
||||
|
||||
if ((mode & CLEANUP_POST_REGSTACK) && stack_regs_mentioned (i1))
|
||||
{
|
||||
/* If register stack conversion has already been done, then
|
||||
death notes must also be compared before it is certain that
|
||||
the two instruction streams match. */
|
||||
|
||||
rtx note;
|
||||
HARD_REG_SET i1_regset, i2_regset;
|
||||
|
||||
CLEAR_HARD_REG_SET (i1_regset);
|
||||
CLEAR_HARD_REG_SET (i2_regset);
|
||||
|
||||
for (note = REG_NOTES (i1); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i1_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
for (note = REG_NOTES (i2); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
GO_IF_HARD_REG_EQUAL (i1_regset, i2_regset, done);
|
||||
|
||||
return false;
|
||||
|
||||
done:
|
||||
;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (reload_completed
|
||||
? ! rtx_renumbered_equal_p (p1, p2) : ! rtx_equal_p (p1, p2))
|
||||
{
|
||||
/* The following code helps take care of G++ cleanups. */
|
||||
rtx equiv1 = find_reg_equal_equiv_note (i1);
|
||||
rtx equiv2 = find_reg_equal_equiv_note (i2);
|
||||
|
||||
if (equiv1 && equiv2
|
||||
/* If the equivalences are not to a constant, they may
|
||||
reference pseudos that no longer exist, so we can't
|
||||
use them. */
|
||||
&& (! reload_completed
|
||||
|| (CONSTANT_P (XEXP (equiv1, 0))
|
||||
&& rtx_equal_p (XEXP (equiv1, 0), XEXP (equiv2, 0)))))
|
||||
{
|
||||
rtx s1 = single_set (i1);
|
||||
rtx s2 = single_set (i2);
|
||||
if (s1 != 0 && s2 != 0
|
||||
&& rtx_renumbered_equal_p (SET_DEST (s1), SET_DEST (s2)))
|
||||
{
|
||||
validate_change (i1, &SET_SRC (s1), XEXP (equiv1, 0), 1);
|
||||
validate_change (i2, &SET_SRC (s2), XEXP (equiv2, 0), 1);
|
||||
if (! rtx_renumbered_equal_p (p1, p2))
|
||||
cancel_changes (0);
|
||||
else if (apply_change_group ())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Look through the insns at the end of BB1 and BB2 and find the longest
|
||||
sequence that are equivalent. Store the first insns for that sequence
|
||||
in *F1 and *F2 and return the sequence length.
|
||||
@ -559,7 +663,7 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
|
||||
basic_block bb1, bb2;
|
||||
rtx *f1, *f2;
|
||||
{
|
||||
rtx i1, i2, p1, p2, last1, last2, afterlast1, afterlast2;
|
||||
rtx i1, i2, last1, last2, afterlast1, afterlast2;
|
||||
int ninsns = 0;
|
||||
|
||||
/* Skip simple jumps at the end of the blocks. Complex jumps still
|
||||
@ -586,100 +690,11 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
|
||||
if (i1 == bb1->head || i2 == bb2->head)
|
||||
break;
|
||||
|
||||
/* Verify that I1 and I2 are equivalent. */
|
||||
|
||||
if (GET_CODE (i1) != GET_CODE (i2))
|
||||
if (!insns_match_p (mode, i1, i2))
|
||||
break;
|
||||
|
||||
p1 = PATTERN (i1);
|
||||
p2 = PATTERN (i2);
|
||||
|
||||
/* If this is a CALL_INSN, compare register usage information.
|
||||
If we don't check this on stack register machines, the two
|
||||
CALL_INSNs might be merged leaving reg-stack.c with mismatching
|
||||
numbers of stack registers in the same basic block.
|
||||
If we don't check this on machines with delay slots, a delay slot may
|
||||
be filled that clobbers a parameter expected by the subroutine.
|
||||
|
||||
??? We take the simple route for now and assume that if they're
|
||||
equal, they were constructed identically. */
|
||||
|
||||
if (GET_CODE (i1) == CALL_INSN
|
||||
&& ! rtx_equal_p (CALL_INSN_FUNCTION_USAGE (i1),
|
||||
CALL_INSN_FUNCTION_USAGE (i2)))
|
||||
break;
|
||||
|
||||
#ifdef STACK_REGS
|
||||
/* If cross_jump_death_matters is not 0, the insn's mode
|
||||
indicates whether or not the insn contains any stack-like
|
||||
regs. */
|
||||
|
||||
if ((mode & CLEANUP_POST_REGSTACK) && stack_regs_mentioned (i1))
|
||||
{
|
||||
/* If register stack conversion has already been done, then
|
||||
death notes must also be compared before it is certain that
|
||||
the two instruction streams match. */
|
||||
|
||||
rtx note;
|
||||
HARD_REG_SET i1_regset, i2_regset;
|
||||
|
||||
CLEAR_HARD_REG_SET (i1_regset);
|
||||
CLEAR_HARD_REG_SET (i2_regset);
|
||||
|
||||
for (note = REG_NOTES (i1); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD
|
||||
&& STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i1_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
for (note = REG_NOTES (i2); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD
|
||||
&& STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
GO_IF_HARD_REG_EQUAL (i1_regset, i2_regset, done);
|
||||
|
||||
break;
|
||||
|
||||
done:
|
||||
;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (GET_CODE (p1) != GET_CODE (p2))
|
||||
break;
|
||||
|
||||
if (! rtx_renumbered_equal_p (p1, p2))
|
||||
{
|
||||
/* The following code helps take care of G++ cleanups. */
|
||||
rtx equiv1 = find_reg_equal_equiv_note (i1);
|
||||
rtx equiv2 = find_reg_equal_equiv_note (i2);
|
||||
|
||||
if (equiv1 && equiv2
|
||||
/* If the equivalences are not to a constant, they may
|
||||
reference pseudos that no longer exist, so we can't
|
||||
use them. */
|
||||
&& CONSTANT_P (XEXP (equiv1, 0))
|
||||
&& rtx_equal_p (XEXP (equiv1, 0), XEXP (equiv2, 0)))
|
||||
{
|
||||
rtx s1 = single_set (i1);
|
||||
rtx s2 = single_set (i2);
|
||||
if (s1 != 0 && s2 != 0
|
||||
&& rtx_renumbered_equal_p (SET_DEST (s1), SET_DEST (s2)))
|
||||
{
|
||||
validate_change (i1, &SET_SRC (s1), XEXP (equiv1, 0), 1);
|
||||
validate_change (i2, &SET_SRC (s2), XEXP (equiv2, 0), 1);
|
||||
if (! rtx_renumbered_equal_p (p1, p2))
|
||||
cancel_changes (0);
|
||||
else if (apply_change_group ())
|
||||
goto win;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
win:
|
||||
/* Don't begin a cross-jump with a USE or CLOBBER insn. */
|
||||
if (GET_CODE (p1) != USE && GET_CODE (p1) != CLOBBER)
|
||||
if (active_insn_p (i1))
|
||||
{
|
||||
/* If the merged insns have different REG_EQUAL notes, then
|
||||
remove them. */
|
||||
@ -743,10 +758,15 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
|
||||
We may assume that there exists one edge with a common destination. */
|
||||
|
||||
static bool
|
||||
outgoing_edges_match (bb1, bb2)
|
||||
outgoing_edges_match (mode, bb1, bb2)
|
||||
int mode;
|
||||
basic_block bb1;
|
||||
basic_block bb2;
|
||||
{
|
||||
int nehedges1 = 0, nehedges2 = 0;
|
||||
edge fallthru1 = 0, fallthru2 = 0;
|
||||
edge e1, e2;
|
||||
|
||||
/* If BB1 has only one successor, we must be looking at an unconditional
|
||||
jump. Which, by the assumption above, means that we only need to check
|
||||
that BB2 has one successor. */
|
||||
@ -862,10 +882,60 @@ outgoing_edges_match (bb1, bb2)
|
||||
return match;
|
||||
}
|
||||
|
||||
/* ??? We can handle computed jumps too. This may be important for
|
||||
inlined functions containing switch statements. Also jumps w/o
|
||||
fallthru edges can be handled by simply matching whole insn. */
|
||||
return false;
|
||||
/* Generic case - we are seeing an computed jump, table jump or trapping
|
||||
instruction. */
|
||||
|
||||
/* First ensure that the instructions match. There may be many outgoing
|
||||
edges so this test is generally cheaper.
|
||||
??? Currently the tablejumps will never match, as they do have
|
||||
different tables. */
|
||||
if (!insns_match_p (mode, bb1->end, bb2->end))
|
||||
return false;
|
||||
|
||||
/* Search the outgoing edges, ensure that the counts do match, find possible
|
||||
fallthru and exception handling edges since these needs more
|
||||
validation. */
|
||||
for (e1 = bb1->succ, e2 = bb2->succ; e1 && e2;
|
||||
e1 = e1->succ_next, e2 = e2->succ_next)
|
||||
{
|
||||
if (e1->flags & EDGE_EH)
|
||||
nehedges1++;
|
||||
if (e2->flags & EDGE_EH)
|
||||
nehedges2++;
|
||||
if (e1->flags & EDGE_FALLTHRU)
|
||||
fallthru1 = e1;
|
||||
if (e2->flags & EDGE_FALLTHRU)
|
||||
fallthru2 = e2;
|
||||
}
|
||||
/* If number of edges of various types does not match, fail. */
|
||||
if (e1 || e2)
|
||||
return false;
|
||||
if (nehedges1 != nehedges2)
|
||||
return false;
|
||||
if ((fallthru1 != 0) != (fallthru2 != 0))
|
||||
return false;
|
||||
|
||||
/* fallthru edges must be forwarded to the same destination. */
|
||||
if (fallthru1)
|
||||
{
|
||||
basic_block d1 = (forwarder_block_p (fallthru1->dest)
|
||||
? fallthru1->dest->succ->dest: fallthru1->dest);
|
||||
basic_block d2 = (forwarder_block_p (fallthru2->dest)
|
||||
? fallthru2->dest->succ->dest: fallthru2->dest);
|
||||
if (d1 != d2)
|
||||
return false;
|
||||
}
|
||||
/* In case we do have EH edges, ensure we are in the same region. */
|
||||
if (nehedges1)
|
||||
{
|
||||
rtx n1 = find_reg_note (bb1->end, REG_EH_REGION, 0);
|
||||
rtx n2 = find_reg_note (bb2->end, REG_EH_REGION, 0);
|
||||
if (XEXP (n1, 0) != XEXP (n2, 0))
|
||||
return false;
|
||||
}
|
||||
/* We don't need to match the rest of edges as above checks should be enought
|
||||
to ensure that they are equivalent. */
|
||||
return true;
|
||||
}
|
||||
|
||||
/* E1 and E2 are edges with the same destination block. Search their
|
||||
@ -924,14 +994,8 @@ try_crossjump_to_edge (mode, e1, e2)
|
||||
if (!src1->pred || !src2->pred)
|
||||
return false;
|
||||
|
||||
/* Likewise with complex edges.
|
||||
??? We should be able to handle most complex edges later with some
|
||||
care. */
|
||||
if (e1->flags & EDGE_COMPLEX)
|
||||
return false;
|
||||
|
||||
/* Look for the common insn sequence, part the first ... */
|
||||
if (!outgoing_edges_match (src1, src2))
|
||||
if (!outgoing_edges_match (mode, src1, src2))
|
||||
return false;
|
||||
|
||||
/* ... and part the second. */
|
||||
@ -1066,11 +1130,6 @@ try_crossjump_bb (mode, bb)
|
||||
{
|
||||
nexte = e->pred_next;
|
||||
|
||||
/* Elide complex edges now, as neither try_crossjump_to_edge
|
||||
nor outgoing_edges_match can handle them. */
|
||||
if (e->flags & EDGE_COMPLEX)
|
||||
continue;
|
||||
|
||||
/* As noted above, first try with the fallthru predecessor. */
|
||||
if (fallthru)
|
||||
{
|
||||
@ -1113,11 +1172,6 @@ try_crossjump_bb (mode, bb)
|
||||
if (e2 == fallthru)
|
||||
continue;
|
||||
|
||||
/* Again, neither try_crossjump_to_edge nor outgoing_edges_match
|
||||
can handle complex edges. */
|
||||
if (e2->flags & EDGE_COMPLEX)
|
||||
continue;
|
||||
|
||||
/* The "first successor" check above only prevents multiple
|
||||
checks of crossjump(A,B). In order to prevent redundant
|
||||
checks of crossjump(B,A), require that A be the block
|
||||
|
@ -817,18 +817,42 @@ override_options ()
|
||||
{
|
||||
const char *const name; /* processor name or nickname. */
|
||||
const enum processor_type processor;
|
||||
const enum pta_flags
|
||||
{
|
||||
PTA_SSE = 1,
|
||||
PTA_SSE2 = 2,
|
||||
PTA_MMX = 4,
|
||||
PTA_SSEPREFETCH = 8,
|
||||
PTA_3DNOW = 16,
|
||||
PTA_3DNOW_A = 64
|
||||
} flags;
|
||||
}
|
||||
const processor_alias_table[] =
|
||||
{
|
||||
{"i386", PROCESSOR_I386},
|
||||
{"i486", PROCESSOR_I486},
|
||||
{"i586", PROCESSOR_PENTIUM},
|
||||
{"pentium", PROCESSOR_PENTIUM},
|
||||
{"i686", PROCESSOR_PENTIUMPRO},
|
||||
{"pentiumpro", PROCESSOR_PENTIUMPRO},
|
||||
{"k6", PROCESSOR_K6},
|
||||
{"athlon", PROCESSOR_ATHLON},
|
||||
{"pentium4", PROCESSOR_PENTIUM4},
|
||||
{"i386", PROCESSOR_I386, 0},
|
||||
{"i486", PROCESSOR_I486, 0},
|
||||
{"i586", PROCESSOR_PENTIUM, 0},
|
||||
{"pentium", PROCESSOR_PENTIUM, 0},
|
||||
{"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
|
||||
{"i686", PROCESSOR_PENTIUMPRO, 0},
|
||||
{"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
|
||||
{"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
|
||||
{"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSEPREFETCH},
|
||||
{"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2 |
|
||||
PTA_MMX | PTA_SSEPREFETCH},
|
||||
{"k6", PROCESSOR_K6, PTA_MMX},
|
||||
{"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
|
||||
{"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
|
||||
{"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A},
|
||||
{"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH
|
||||
| PTA_3DNOW | PTA_3DNOW_A},
|
||||
{"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A | PTA_SSE},
|
||||
{"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A | PTA_SSE},
|
||||
{"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A | PTA_SSE},
|
||||
};
|
||||
|
||||
int const pta_size = sizeof (processor_alias_table) / sizeof (struct pta);
|
||||
@ -880,6 +904,21 @@ override_options ()
|
||||
ix86_arch = processor_alias_table[i].processor;
|
||||
/* Default cpu tuning to the architecture. */
|
||||
ix86_cpu = ix86_arch;
|
||||
if (processor_alias_table[i].flags & PTA_MMX
|
||||
&& !(target_flags & MASK_MMX_SET))
|
||||
target_flags |= MASK_MMX;
|
||||
if (processor_alias_table[i].flags & PTA_3DNOW
|
||||
&& !(target_flags & MASK_3DNOW_SET))
|
||||
target_flags |= MASK_3DNOW;
|
||||
if (processor_alias_table[i].flags & PTA_3DNOW_A
|
||||
&& !(target_flags & MASK_3DNOW_A_SET))
|
||||
target_flags |= MASK_3DNOW_A;
|
||||
if (processor_alias_table[i].flags & PTA_SSE
|
||||
&& !(target_flags & MASK_SSE_SET))
|
||||
target_flags |= MASK_SSE;
|
||||
if (processor_alias_table[i].flags & PTA_SSE2
|
||||
&& !(target_flags & MASK_SSE2_SET))
|
||||
target_flags |= MASK_SSE2;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1045,7 +1084,7 @@ override_options ()
|
||||
target_flags |= MASK_3DNOW_A;
|
||||
}
|
||||
if ((x86_accumulate_outgoing_args & CPUMASK)
|
||||
&& !(target_flags & MASK_NO_ACCUMULATE_OUTGOING_ARGS)
|
||||
&& !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS_SET)
|
||||
&& !optimize_size)
|
||||
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
|
||||
|
||||
|
@ -112,25 +112,30 @@ extern int target_flags;
|
||||
#define MASK_NO_FANCY_MATH_387 0x00000040 /* Disable sin, cos, sqrt */
|
||||
#define MASK_OMIT_LEAF_FRAME_POINTER 0x080 /* omit leaf frame pointers */
|
||||
#define MASK_STACK_PROBE 0x00000100 /* Enable stack probing */
|
||||
#define MASK_NO_ALIGN_STROPS 0x00001000 /* Enable aligning of string ops. */
|
||||
#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */
|
||||
#define MASK_NO_PUSH_ARGS 0x00004000 /* Use push instructions */
|
||||
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00008000/* Accumulate outgoing args */
|
||||
#define MASK_NO_ACCUMULATE_OUTGOING_ARGS 0x00010000
|
||||
#define MASK_MMX 0x00020000 /* Support MMX regs/builtins */
|
||||
#define MASK_SSE 0x00040000 /* Support SSE regs/builtins */
|
||||
#define MASK_SSE2 0x00080000 /* Support SSE2 regs/builtins */
|
||||
#define MASK_NO_ALIGN_STROPS 0x00000200 /* Enable aligning of string ops. */
|
||||
#define MASK_INLINE_ALL_STROPS 0x00000400 /* Inline stringops in all cases */
|
||||
#define MASK_NO_PUSH_ARGS 0x00000800 /* Use push instructions */
|
||||
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00001000/* Accumulate outgoing args */
|
||||
#define MASK_ACCUMULATE_OUTGOING_ARGS_SET 0x00002000
|
||||
#define MASK_MMX 0x00004000 /* Support MMX regs/builtins */
|
||||
#define MASK_MMX_SET 0x00008000
|
||||
#define MASK_SSE 0x00010000 /* Support SSE regs/builtins */
|
||||
#define MASK_SSE_SET 0x00020000
|
||||
#define MASK_SSE2 0x00040000 /* Support SSE2 regs/builtins */
|
||||
#define MASK_SSE2_SET 0x00080000
|
||||
#define MASK_3DNOW 0x00100000 /* Support 3Dnow builtins */
|
||||
#define MASK_3DNOW_A 0x00200000 /* Support Athlon 3Dnow builtins */
|
||||
#define MASK_128BIT_LONG_DOUBLE 0x00400000 /* long double size is 128bit */
|
||||
#define MASK_MIX_SSE_I387 0x00800000 /* Mix SSE and i387 instructions */
|
||||
#define MASK_64BIT 0x01000000 /* Produce 64bit code */
|
||||
#define MASK_NO_RED_ZONE 0x02000000 /* Do not use red zone */
|
||||
#define MASK_3DNOW_SET 0x00200000
|
||||
#define MASK_3DNOW_A 0x00400000 /* Support Athlon 3Dnow builtins */
|
||||
#define MASK_3DNOW_A_SET 0x00800000
|
||||
#define MASK_128BIT_LONG_DOUBLE 0x01000000 /* long double size is 128bit */
|
||||
#define MASK_MIX_SSE_I387 0x02000000 /* Mix SSE and i387 instructions */
|
||||
#define MASK_64BIT 0x04000000 /* Produce 64bit code */
|
||||
#define MASK_NO_RED_ZONE 0x08000000 /* Do not use red zone */
|
||||
|
||||
/* Temporary codegen switches */
|
||||
#define MASK_INTEL_SYNTAX 0x00000200
|
||||
#define MASK_DEBUG_ARG 0x00000400 /* function_arg */
|
||||
#define MASK_DEBUG_ADDR 0x00000800 /* GO_IF_LEGITIMATE_ADDRESS */
|
||||
#define MASK_INTEL_SYNTAX 0x10000000
|
||||
#define MASK_DEBUG_ARG 0x20000000 /* function_arg */
|
||||
#define MASK_DEBUG_ADDR 0x40000000 /* GO_IF_LEGITIMATE_ADDRESS */
|
||||
|
||||
/* Use the floating point instructions */
|
||||
#define TARGET_80387 (target_flags & MASK_80387)
|
||||
@ -335,24 +340,30 @@ extern const int x86_epilogue_using_move, x86_decompose_lea;
|
||||
N_("Use push instructions to save outgoing arguments") }, \
|
||||
{ "no-push-args", MASK_NO_PUSH_ARGS, \
|
||||
N_("Do not use push instructions to save outgoing arguments") }, \
|
||||
{ "accumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS, \
|
||||
{ "accumulate-outgoing-args", (MASK_ACCUMULATE_OUTGOING_ARGS \
|
||||
| MASK_ACCUMULATE_OUTGOING_ARGS_SET), \
|
||||
N_("Use push instructions to save outgoing arguments") }, \
|
||||
{ "no-accumulate-outgoing-args",MASK_NO_ACCUMULATE_OUTGOING_ARGS, \
|
||||
{ "no-accumulate-outgoing-args",MASK_ACCUMULATE_OUTGOING_ARGS_SET, \
|
||||
N_("Do not use push instructions to save outgoing arguments") }, \
|
||||
{ "mmx", MASK_MMX, N_("Support MMX builtins") }, \
|
||||
{ "no-mmx", -MASK_MMX, \
|
||||
{ "mmx", MASK_MMX | MASK_MMX_SET, \
|
||||
N_("Support MMX builtins") }, \
|
||||
{ "no-mmx", -MASK_MMX, \
|
||||
N_("Do not support MMX builtins") }, \
|
||||
{ "3dnow", MASK_3DNOW, \
|
||||
{ "no-mmx", MASK_MMX_SET, N_("") }, \
|
||||
{ "3dnow", MASK_3DNOW | MASK_3DNOW_SET, \
|
||||
N_("Support 3DNow! builtins") }, \
|
||||
{ "no-3dnow", -MASK_3DNOW, \
|
||||
{ "no-3dnow", -MASK_3DNOW, N_("") }, \
|
||||
{ "no-3dnow", MASK_3DNOW_SET, \
|
||||
N_("Do not support 3DNow! builtins") }, \
|
||||
{ "sse", MASK_SSE, \
|
||||
{ "sse", MASK_SSE | MASK_SSE_SET, \
|
||||
N_("Support MMX and SSE builtins and code generation") }, \
|
||||
{ "no-sse", -MASK_SSE, \
|
||||
{ "no-sse", -MASK_SSE, N_("") }, \
|
||||
{ "no-sse", MASK_SSE_SET, \
|
||||
N_("Do not support MMX and SSE builtins and code generation") }, \
|
||||
{ "sse2", MASK_SSE2, \
|
||||
{ "sse2", MASK_SSE2 | MASK_SSE2_SET, \
|
||||
N_("Support MMX, SSE and SSE2 builtins and code generation") }, \
|
||||
{ "no-sse2", -MASK_SSE2, \
|
||||
{ "no-sse2", -MASK_SSE2, N_("") }, \
|
||||
{ "no-sse2", MASK_SSE2_SET, \
|
||||
N_("Do not support MMX, SSE and SSE2 builtins and code generation") }, \
|
||||
{ "mix-sse-i387", MASK_MIX_SSE_I387, \
|
||||
N_("Use both SSE and i387 instruction sets for floating point arithmetics") },\
|
||||
@ -522,11 +533,22 @@ extern int ix86_arch;
|
||||
%{march=pentium4:-D__pentium4 -D__pentium4__ %{!mcpu*:-D__tune_pentium4__ }}\
|
||||
%{m386|mcpu=i386:-D__tune_i386__ }\
|
||||
%{m486|mcpu=i486:-D__tune_i486__ }\
|
||||
%{mpentium|mcpu=pentium|mcpu=i586:-D__tune_i586__ -D__tune_pentium__ }\
|
||||
%{mpentiumpro|mcpu=pentiumpro|mcpu=i686:-D__tune_i686__ -D__tune_pentiumpro__ }\
|
||||
%{mcpu=k6:-D__tune_k6__ }\
|
||||
%{mcpu=athlon:-D__tune_athlon__ }\
|
||||
%{mpentium|mcpu=pentium|mcpu=i586|mcpu=pentium-mmx:-D__tune_i586__ -D__tune_pentium__ }\
|
||||
%{mpentiumpro|mcpu=pentiumpro|mcpu=i686|cpu=pentium2|cpu=pentium3:-D__tune_i686__\
|
||||
-D__tune_pentiumpro__ }\
|
||||
%{mcpu=k6|mcpu=k6-2|mcpu=k6-3:-D__tune_k6__ }\
|
||||
%{mcpu=athlon|mcpu=athlon-tbird|mcpu=athlon-4|mcpu=athlon-xp|mcpu=athlon-mp:\
|
||||
-D__tune_athlon__ }\
|
||||
%{mcpu=pentium4:-D__tune_pentium4__ }\
|
||||
%{march=march=athlon-tbird|march=athlon-xp|march=athlon-mp|march=pentium3|march=pentium4:\
|
||||
-D__SSE__ }\
|
||||
%{march=pentium-mmx|march=k6|march=k6-2|march=k6-3\
|
||||
march=athlon|march=athlon-tbird|march=athlon-4|march=athlon-xp\
|
||||
|march=athlon-mp|march=pentium2|march=pentium3|march=pentium4: -D__MMX__ }\
|
||||
%{march=k6|march=k6-2|march=k6-3\
|
||||
march=athlon|march=athlon-tbird|march=athlon-4|march=athlon-xp\
|
||||
|march=athlon-mp: -D__3dNOW__ }\
|
||||
%{mcpu=mcpu=pentium4: -D__SSE2__ }\
|
||||
%{!march*:%{!mcpu*:%{!m386:%{!m486:%{!mpentium*:%(cpp_cpu_default)}}}}}"
|
||||
|
||||
#ifndef CPP_CPU_SPEC
|
||||
|
gcc/invoke.texi
@@ -272,7 +272,7 @@ in the following sections.
-fno-inline -fno-math-errno -fno-peephole -fno-peephole2 @gol
-funsafe-math-optimizations -fno-trapping-math @gol
-fomit-frame-pointer -foptimize-register-move @gol
-foptimize-sibling-calls -freduce-all-givs @gol
-foptimize-sibling-calls -fprefetch-loop-arrays -freduce-all-givs @gol
-fregmove -frename-registers @gol
-frerun-cse-after-loop -frerun-loop-opt @gol
-fschedule-insns -fschedule-insns2 @gol
@@ -3570,6 +3570,10 @@ the loop is entered. This usually makes programs run more slowly.
@option{-funroll-all-loops} implies the same options as
@option{-funroll-loops},

@item -fprefetch-loop-arrays
@opindex fprefetch-loop-arrays
If supported by the target machine, generate instructions to prefetch
memory to improve the performance of loops that access large arrays.
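As a concrete sketch of the kind of loop the new option is aimed at -- illustrative only, the function name and types are made up, not taken from the patch -- a dense, constant-stride traversal of a large array is the pattern the pass recognizes from the loop's induction variables:

    /* Illustrative example only.  Compiled with the new option enabled
       (e.g. -O2 -fprefetch-loop-arrays on a target with a prefetch insn),
       this loop is a candidate for the inserted prefetches.  */
    double
    sum_array (const double *a, long n)
    {
      double s = 0.0;
      long i;

      for (i = 0; i < n; i++)   /* constant stride, known from the BIV/GIV */
        s += a[i];
      return s;
    }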
@item -fmove-all-movables
@opindex fmove-all-movables
@@ -7476,10 +7480,13 @@ computers:
@table @gcctabopt
@item -mcpu=@var{cpu-type}
@opindex mcpu
Assume the defaults for the machine type @var{cpu-type} when scheduling
instructions. The choices for @var{cpu-type} are @samp{i386},
@samp{i486}, @samp{i586}, @samp{i686}, @samp{pentium},
@samp{pentiumpro}, @samp{pentium4}, @samp{k6}, and @samp{athlon}
Tune to @var{cpu-type} everything applicable about the generated code, except
for the ABI and the set of available instructions. The choices for
@var{cpu-type} are @samp{i386}, @samp{i486}, @samp{i586}, @samp{i686},
@samp{pentium}, @samp{pentium-mmx}, @samp{pentiumpro}, @samp{pentium2},
@samp{pentium3}, @samp{pentium4}, @samp{k6}, @samp{k6-2}, @samp{k6-3},
@samp{athlon}, @samp{athlon-tbird}, @samp{athlon-4}, @samp{athlon-xp}
and @samp{athlon-mp}.

While picking a specific @var{cpu-type} will schedule things appropriately
for that particular chip, the compiler will not generate any code that
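Alongside the tuning macros, the updated CPP_CPU_SPECS in i386.h also predefines feature-test macros (__MMX__, __SSE__, __SSE2__, __3dNOW__) for the corresponding new -march/-mcpu values.  A minimal, purely illustrative sketch of source keyed off those macros (the function is hypothetical, not part of GCC):

    /* Illustrative only: pick a code path based on the macros the driver
       now predefines for the new -march/-mcpu variants.  */
    const char *
    isa_in_use (void)
    {
    #if defined (__SSE2__)
      return "sse2";
    #elif defined (__SSE__)
      return "sse";
    #elif defined (__3dNOW__)
      return "3dnow";
    #elif defined (__MMX__)
      return "mmx";
    #else
      return "baseline x86";
    #endif
    }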
gcc/flags.h
@@ -269,6 +269,10 @@ extern int flag_unroll_all_loops;

extern int flag_move_all_movables;

/* Nonzero enables prefetch optimizations for arrays in loops. */

extern int flag_prefetch_loop_arrays;

/* Nonzero forces all general induction variables in loops to be
   strength reduced. */
gcc/loop.c | 593
@ -53,6 +53,90 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
|
||||
#include "except.h"
|
||||
#include "toplev.h"
|
||||
#include "predict.h"
|
||||
#include "insn-flags.h"
|
||||
|
||||
/* Not really meaningful values, but at least something. */
|
||||
#ifndef SIMULTANEOUS_PREFETCHES
|
||||
#define SIMULTANEOUS_PREFETCHES 3
|
||||
#endif
|
||||
#ifndef PREFETCH_BLOCK
|
||||
#define PREFETCH_BLOCK 32
|
||||
#endif
|
||||
#ifndef HAVE_prefetch
|
||||
#define HAVE_prefetch 0
|
||||
#define gen_prefetch(a,b,c) (abort(), NULL_RTX)
|
||||
#endif
|
||||
|
||||
/* Give up the prefetch optimizations once we exceed a given threshhold.
|
||||
It is unlikely that we would be able to optimize something in a loop
|
||||
with so many detected prefetches. */
|
||||
#define MAX_PREFETCHES 100
|
||||
/* The number of prefetch blocks that are beneficial to fetch at once before
|
||||
a loop with a known (and low) iteration count. */
|
||||
#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 6
|
||||
/* For very tiny loops it is not worthwhile to prefetch even before the loop,
|
||||
since it is likely that the data are already in the cache. */
|
||||
#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN 2
|
||||
/* The minimal number of prefetch blocks that a loop must consume to make
|
||||
the emitting of prefetch instruction in the body of loop worthwhile. */
|
||||
#define PREFETCH_BLOCKS_IN_LOOP_MIN 6
|
||||
|
||||
/* Parameterize some prefetch heuristics so they can be turned on and off
|
||||
easily for performance testing on new architecures. These can be
|
||||
defined in target-dependent files. */
|
||||
|
||||
/* Prefetch is worthwhile only when loads/stores are dense. */
|
||||
#ifndef PREFETCH_ONLY_DENSE_MEM
|
||||
#define PREFETCH_ONLY_DENSE_MEM 1
|
||||
#endif
|
||||
|
||||
/* Define what we mean by "dense" loads and stores; This value divided by 256
|
||||
is the minimum percentage of memory references that worth prefetching. */
|
||||
#ifndef PREFETCH_DENSE_MEM
|
||||
#define PREFETCH_DENSE_MEM 220
|
||||
#endif
|
||||
|
||||
/* Do not prefetch for a loop whose iteration count is known to be low. */
|
||||
#ifndef PREFETCH_NO_LOW_LOOPCNT
|
||||
#define PREFETCH_NO_LOW_LOOPCNT 1
|
||||
#endif
|
||||
|
||||
/* Define what we mean by a "low" iteration count. */
|
||||
#ifndef PREFETCH_LOW_LOOPCNT
|
||||
#define PREFETCH_LOW_LOOPCNT 32
|
||||
#endif
|
||||
|
||||
/* Do not prefetch for a loop that contains a function call; such a loop is
|
||||
probably not an internal loop. */
|
||||
#ifndef PREFETCH_NO_CALL
|
||||
#define PREFETCH_NO_CALL 1
|
||||
#endif
|
||||
|
||||
/* Do not prefetch accesses with an extreme stride. */
|
||||
#ifndef PREFETCH_NO_EXTREME_STRIDE
|
||||
#define PREFETCH_NO_EXTREME_STRIDE 1
|
||||
#endif
|
||||
|
||||
/* Define what we mean by an "extreme" stride. */
|
||||
#ifndef PREFETCH_EXTREME_STRIDE
|
||||
#define PREFETCH_EXTREME_STRIDE 4096
|
||||
#endif
|
||||
|
||||
/* Do not handle reversed order prefetches (negative stride). */
|
||||
#ifndef PREFETCH_NO_REVERSE_ORDER
|
||||
#define PREFETCH_NO_REVERSE_ORDER 1
|
||||
#endif
|
||||
|
||||
/* Prefetch even if the GIV is not always executed. */
|
||||
#ifndef PREFETCH_NOT_ALWAYS
|
||||
#define PREFETCH_NOT_ALWAYS 0
|
||||
#endif
|
||||
|
||||
/* If the loop requires more prefetches than the target can process in
|
||||
parallel then don't prefetch anything in that loop. */
|
||||
#ifndef PREFETCH_LIMIT_TO_SIMULTANEOUS
|
||||
#define PREFETCH_LIMIT_TO_SIMULTANEOUS 1
|
||||
#endif
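These heuristics are deliberately parameterized so that a port can override them, as the comment above notes.  A hypothetical target-header override, with made-up values purely for illustration (not taken from any real target):

    /* Illustrative overrides only -- the numbers are invented.  */
    #define PREFETCH_BLOCK 64               /* 64-byte cache lines.           */
    #define SIMULTANEOUS_PREFETCHES 8       /* deeper prefetch queue.         */
    #define PREFETCH_NOT_ALWAYS 1           /* prefetch conditional GIVs too. */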
|
||||
|
||||
#define LOOP_REG_LIFETIME(LOOP, REGNO) \
|
||||
((REGNO_LAST_LUID (REGNO) - REGNO_FIRST_LUID (REGNO)))
|
||||
@ -262,6 +346,7 @@ static rtx loop_insn_sink_or_swim PARAMS((const struct loop *, rtx));
|
||||
|
||||
static void loop_dump_aux PARAMS ((const struct loop *, FILE *, int));
|
||||
static void loop_delete_insns PARAMS ((rtx, rtx));
|
||||
static int remove_constant_addition PARAMS ((rtx *));
|
||||
void debug_ivs PARAMS ((const struct loop *));
|
||||
void debug_iv_class PARAMS ((const struct iv_class *));
|
||||
void debug_biv PARAMS ((const struct induction *));
|
||||
@ -3412,6 +3497,509 @@ loop_reg_used_before_p (loop, set, insn)
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Information we collect about arrays that we might want to prefetch. */
|
||||
struct prefetch_info
|
||||
{
|
||||
struct iv_class *class; /* Class this prefetch is based on. */
|
||||
struct induction *giv; /* GIV this prefetch is based on. */
|
||||
rtx base_address; /* Start prefetching from this address plus
|
||||
index. */
|
||||
HOST_WIDE_INT index;
|
||||
HOST_WIDE_INT stride; /* Prefetch stride in bytes in each
|
||||
iteration. */
|
||||
unsigned int bytes_accesed; /* Sum of sizes of all acceses to this
|
||||
prefetch area in one iteration. */
|
||||
unsigned int total_bytes; /* Total bytes loop will access in this block.
|
||||
This is set only for loops with known
|
||||
iteration counts and is 0xffffffff
|
||||
otherwise. */
|
||||
unsigned int write : 1; /* 1 for read/write prefetches. */
|
||||
unsigned int prefetch_in_loop : 1;
|
||||
/* 1 for those chosen for prefetching. */
|
||||
unsigned int prefetch_before_loop : 1;
|
||||
/* 1 for those chosen for prefetching. */
|
||||
};
|
||||
|
||||
/* Data used by check_store function. */
|
||||
struct check_store_data
|
||||
{
|
||||
rtx mem_address;
|
||||
int mem_write;
|
||||
};
|
||||
|
||||
static void check_store PARAMS ((rtx, rtx, void *));
|
||||
static void emit_prefetch_instructions PARAMS ((struct loop *));
|
||||
static int rtx_equal_for_prefetch_p PARAMS ((rtx, rtx));
|
||||
|
||||
/* Set mem_write when mem_address is found. Used as callback to
|
||||
note_stores. */
|
||||
static void
|
||||
check_store (x, pat, data)
|
||||
rtx x, pat ATTRIBUTE_UNUSED;
|
||||
void *data;
|
||||
{
|
||||
struct check_store_data *d = (struct check_store_data *)data;
|
||||
|
||||
if ((GET_CODE (x) == MEM) && rtx_equal_p (d->mem_address, XEXP (x, 0)))
|
||||
d->mem_write = 1;
|
||||
}
|
||||
|
||||
/* Like rtx_equal_p, but attempts to swap commutative operands. This is
|
||||
important to get some addresses combined. Later more sophisticated
|
||||
transformations can be added when necesary.
|
||||
|
||||
??? Same trick with swapping operand is done at several other places.
|
||||
It can be nice to develop some common way to handle this. */
|
||||
|
||||
static int
|
||||
rtx_equal_for_prefetch_p (x, y)
|
||||
rtx x, y;
|
||||
{
|
||||
int i;
|
||||
int j;
|
||||
enum rtx_code code = GET_CODE (x);
|
||||
const char *fmt;
|
||||
|
||||
if (x == y)
|
||||
return 1;
|
||||
if (code != GET_CODE (y))
|
||||
return 0;
|
||||
|
||||
code = GET_CODE (x);
|
||||
|
||||
if (GET_RTX_CLASS (code) == 'c')
|
||||
{
|
||||
return ((rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 0))
|
||||
&& rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 1)))
|
||||
|| (rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 1))
|
||||
&& rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 0))));
|
||||
}
|
||||
/* Compare the elements. If any pair of corresponding elements fails to
|
||||
match, return 0 for the whole thing. */
|
||||
|
||||
fmt = GET_RTX_FORMAT (code);
|
||||
for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
|
||||
{
|
||||
switch (fmt[i])
|
||||
{
|
||||
case 'w':
|
||||
if (XWINT (x, i) != XWINT (y, i))
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'i':
|
||||
if (XINT (x, i) != XINT (y, i))
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'E':
|
||||
/* Two vectors must have the same length. */
|
||||
if (XVECLEN (x, i) != XVECLEN (y, i))
|
||||
return 0;
|
||||
|
||||
/* And the corresponding elements must match. */
|
||||
for (j = 0; j < XVECLEN (x, i); j++)
|
||||
if (rtx_equal_for_prefetch_p (XVECEXP (x, i, j),
|
||||
XVECEXP (y, i, j)) == 0)
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'e':
|
||||
if (rtx_equal_for_prefetch_p (XEXP (x, i), XEXP (y, i)) == 0)
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 's':
|
||||
if (strcmp (XSTR (x, i), XSTR (y, i)))
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'u':
|
||||
/* These are just backpointers, so they don't matter. */
|
||||
break;
|
||||
|
||||
case '0':
|
||||
break;
|
||||
|
||||
/* It is believed that rtx's at this level will never
|
||||
contain anything but integers and other rtx's,
|
||||
except for within LABEL_REFs and SYMBOL_REFs. */
|
||||
default:
|
||||
abort ();
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Remove constant addition value from the expression X (when present)
|
||||
and return it. */
|
||||
static HOST_WIDE_INT
|
||||
remove_constant_addition (x)
|
||||
rtx *x;
|
||||
{
|
||||
HOST_WIDE_INT addval = 0;
|
||||
rtx exp=*x;
|
||||
|
||||
if (GET_CODE (exp) == CONST)
|
||||
exp = XEXP (exp, 0);
|
||||
if (GET_CODE (exp) == CONST_INT)
|
||||
{
|
||||
addval = INTVAL (exp);
|
||||
*x = const0_rtx;
|
||||
}
|
||||
/* For plus expression recurse on ourself. */
|
||||
else if (GET_CODE (exp) == PLUS)
|
||||
{
|
||||
addval += remove_constant_addition (&XEXP (exp, 0));
|
||||
addval += remove_constant_addition (&XEXP (exp, 1));
|
||||
/* In case our parameter was constant, remove extra zero
|
||||
from the expression. */
|
||||
if (XEXP (exp, 0) == const0_rtx)
|
||||
*x = XEXP (exp, 1);
|
||||
else if (XEXP (exp, 1) == const0_rtx)
|
||||
*x = XEXP (exp, 0);
|
||||
}
|
||||
return addval;
|
||||
}
|
||||
|
||||
/* Attempt to identify accesses to arrays that are most likely to cause cache
|
||||
misses, and emit prefetch instructions a few prefetch blocks forward.
|
||||
|
||||
To detect the arrays we use the GIV information that was collected by the
|
||||
strength reduction pass.
|
||||
|
||||
The prefetch instructions are generated after the GIV information is done
|
||||
and before the strength reduction process. The new GIVs are injected into
|
||||
the strength reduction tables, so the prefetch addresses are optimized as
|
||||
well.
|
||||
|
||||
GIVs are split into base address, stride, and constant addition values.
|
||||
GIVs with the same address, stride and close addition values are combined
|
||||
into a single prefetch. Also writes to GIVs are detected, so that prefetch
|
||||
for write instructions can be used for the block we write to, on machines
|
||||
that support write prefetches.
|
||||
|
||||
Several heuristics are used to determine when to prefetch. They are
|
||||
controlled by defined symbols that can be overridden for each target.
|
||||
*/
|
||||
static void
|
||||
emit_prefetch_instructions (struct loop *loop)
|
||||
{
|
||||
int num_prefetches = 0;
|
||||
int num_real_prefetches = 0;
|
||||
int num_real_write_prefetches = 0;
|
||||
int ahead;
|
||||
int i;
|
||||
struct iv_class *bl;
|
||||
struct induction *iv;
|
||||
struct prefetch_info info[MAX_PREFETCHES];
|
||||
struct loop_ivs *ivs = LOOP_IVS (loop);
|
||||
|
||||
if (!HAVE_prefetch)
|
||||
return;
|
||||
|
||||
/* Consider only loops w/o calls. When a call is done, the loop is probably
|
||||
slow enough to read the memory. */
|
||||
if (PREFETCH_NO_CALL && LOOP_INFO (loop)->has_call)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
fprintf (loop_dump_stream, "Prefetch: ignoring loop - has call.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (PREFETCH_NO_LOW_LOOPCNT
|
||||
&& LOOP_INFO (loop)->n_iterations
|
||||
&& LOOP_INFO (loop)->n_iterations <= PREFETCH_LOW_LOOPCNT)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
fprintf (loop_dump_stream,
|
||||
"Prefetch: ignoring loop - not enought iterations.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Search all induction variables and pick those interesting for the prefetch
|
||||
machinery. */
|
||||
for (bl = ivs->list; bl; bl = bl->next)
|
||||
{
|
||||
struct induction *biv = bl->biv, *biv1;
|
||||
int basestride = 0;
|
||||
|
||||
biv1 = biv;
|
||||
/* Expect all BIVs to be executed in each iteration. This makes our
|
||||
analysis more conservative. */
|
||||
while (biv1)
|
||||
{
|
||||
/* Discard non-constant additions that we can't handle well yet, and
|
||||
BIVs that are executed multiple times; such BIVs ought to be
|
||||
handled in the nested loop. We accept not_every_iteration BIVs,
|
||||
since these only result in larger strides and make our
|
||||
heuristics more conservative.
|
||||
??? What does the last sentence mean? */
|
||||
|
||||
if (GET_CODE (biv->add_val) != CONST_INT)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch: biv %i ignored: non-constant addition at insn %i:",
|
||||
REGNO (biv->src_reg), INSN_UID (biv->insn));
|
||||
print_rtl (loop_dump_stream, biv->add_val);
|
||||
fprintf (loop_dump_stream, "\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (biv->maybe_multiple)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch: biv %i ignored: maybe_multiple at insn %i:",
|
||||
REGNO (biv->src_reg), INSN_UID (biv->insn));
|
||||
print_rtl (loop_dump_stream, biv->add_val);
|
||||
fprintf (loop_dump_stream, "\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
basestride += INTVAL (biv1->add_val);
|
||||
biv1 = biv1->next_iv;
|
||||
}
|
||||
if (biv1 || !basestride)
|
||||
continue;
|
||||
for (iv = bl->giv; iv; iv = iv->next_iv)
|
||||
{
|
||||
rtx address;
|
||||
rtx temp;
|
||||
HOST_WIDE_INT index = 0;
|
||||
int add = 1;
|
||||
HOST_WIDE_INT stride;
|
||||
struct check_store_data d;
|
||||
int size = GET_MODE_SIZE (GET_MODE (iv));
|
||||
|
||||
/* There are several reasons why an induction variable is not
|
||||
interesting to us. */
|
||||
if (iv->giv_type != DEST_ADDR
|
||||
/* We are interested only in constant stride memory references
|
||||
in order to be able to compute density easily. */
|
||||
|| GET_CODE (iv->mult_val) != CONST_INT
|
||||
/* Don't handle reversed order prefetches, since they are usually
|
||||
ineffective. Later we may be able to reverse such BIVs. */
|
||||
|| (PREFETCH_NO_REVERSE_ORDER
|
||||
&& (stride = INTVAL (iv->mult_val) * basestride) < 0)
|
||||
/* Prefetching of accesses with such a extreme stride is probably
|
||||
not worthwhile, either. */
|
||||
|| (PREFETCH_NO_EXTREME_STRIDE
|
||||
&& stride > PREFETCH_EXTREME_STRIDE)
|
||||
/* Ignore GIVs with varying add values; we can't predict the value
|
||||
for the next iteration. */
|
||||
|| !loop_invariant_p (loop, iv->add_val)
|
||||
/* Ignore GIVs in the nested loops; they ought to have been handled
|
||||
already. */
|
||||
|| iv->maybe_multiple)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch: Ignoring giv at %i\n",
|
||||
INSN_UID (iv->insn));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Determine the pointer to the basic array we are examining. It is
|
||||
the sum of the BIV's initial value and the GIV's add_val. */
|
||||
index = 0;
|
||||
|
||||
address = copy_rtx (iv->add_val);
|
||||
temp = copy_rtx (bl->initial_value);
|
||||
|
||||
address = simplify_gen_binary (PLUS, Pmode, temp, address);
|
||||
index = remove_constant_addition (&address);
|
||||
|
||||
index += size;
|
||||
d.mem_write = 0;
|
||||
d.mem_address = *iv->location;
|
||||
/* When the GIV is not always executed, we might be better off by
|
||||
not dirtying the cache pages. */
|
||||
if (PREFETCH_NOT_ALWAYS || iv->always_executed)
|
||||
note_stores (PATTERN (iv->insn), check_store, &d);
|
||||
|
||||
/* Attempt to find another prefetch to the same array and see if we
|
||||
can merge this one. */
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
if (rtx_equal_for_prefetch_p (address, info[i].base_address)
|
||||
&& stride == info[i].stride)
|
||||
{
|
||||
/* In case both access same array (same location
|
||||
just with small difference in constant indexes), merge
|
||||
the prefetches. Just do the later and the earlier will
|
||||
get prefetched from previous iteration.
|
||||
4096 is artificial threshold. It should not be too small,
|
||||
but also not bigger than small portion of memory usually
|
||||
traversed by single loop. */
|
||||
|
||||
if (index >= info[i].index && index - info[i].index < 4096)
|
||||
{
|
||||
info[i].write |= d.mem_write;
|
||||
info[i].bytes_accesed += size;
|
||||
info[i].index = index;
|
||||
info[i].giv = iv;
|
||||
info[i].class = bl;
|
||||
info[num_prefetches].base_address = address;
|
||||
add = 0;
|
||||
break;
|
||||
}
|
||||
if (index < info[i].index && info[i].index - index < 4096)
|
||||
{
|
||||
info[i].write |= d.mem_write;
|
||||
info[i].bytes_accesed += size;
|
||||
add = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Merging failed. */
|
||||
if (add)
|
||||
{
|
||||
info[num_prefetches].giv = iv;
|
||||
info[num_prefetches].class = bl;
|
||||
info[num_prefetches].index = index;
|
||||
info[num_prefetches].stride = stride;
|
||||
info[num_prefetches].base_address = address;
|
||||
info[num_prefetches].write = d.mem_write;
|
||||
info[num_prefetches].bytes_accesed = size;
|
||||
num_prefetches++;
|
||||
if (num_prefetches >= MAX_PREFETCHES)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
fprintf(loop_dump_stream,"Maximal number of prefetches exceeded.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
{
|
||||
/* Attempt to calculate the number of bytes fetched by the loop.
|
||||
Avoid overflow. */
|
||||
if (LOOP_INFO (loop)->n_iterations
|
||||
&& (0xffffffff / info[i].stride) >= LOOP_INFO (loop)->n_iterations)
|
||||
info[i].total_bytes = info[i].stride * LOOP_INFO (loop)->n_iterations;
|
||||
else
|
||||
info[i].total_bytes = 0xffffffff;
|
||||
|
||||
|
||||
/* Prefetch is worthwhile only when the loads/stores are dense. */
|
||||
if (PREFETCH_ONLY_DENSE_MEM
|
||||
&& (info[i].bytes_accesed * 256 / info[i].stride > PREFETCH_DENSE_MEM)
|
||||
&& (info[i].total_bytes / PREFETCH_BLOCK >=
|
||||
PREFETCH_BLOCKS_BEFORE_LOOP_MIN))
|
||||
{
|
||||
info[i].prefetch_before_loop = 1;
|
||||
if (info[i].total_bytes / PREFETCH_BLOCK <=
|
||||
PREFETCH_BLOCKS_BEFORE_LOOP_MAX)
|
||||
info[i].prefetch_in_loop = 0;
|
||||
else
|
||||
info[i].prefetch_in_loop = 1;
|
||||
}
|
||||
else
|
||||
info[i].prefetch_in_loop = 0, info[i].prefetch_before_loop = 0;
|
||||
|
||||
if (info[i].prefetch_in_loop)
|
||||
{
|
||||
num_real_prefetches += ((info[i].stride + PREFETCH_BLOCK - 1)
|
||||
/ PREFETCH_BLOCK);
|
||||
if (info[i].write)
|
||||
num_real_write_prefetches +=
|
||||
((info[i].stride + PREFETCH_BLOCK - 1) / PREFETCH_BLOCK);
|
||||
}
|
||||
}
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch insn %i address: ",
|
||||
INSN_UID (info[i].giv->insn));
|
||||
print_rtl (loop_dump_stream, info[i].base_address);
|
||||
fprintf (loop_dump_stream, " Index:%i stride:%i density:%i%% total_bytes: %u %s in loop:%s before:%s\n",
|
||||
info[i].index, info[i].stride,
|
||||
info[i].bytes_accesed * 100 / info[i].stride,
|
||||
info[i].total_bytes,
|
||||
info[i].write ? "read/write" : "read only",
|
||||
info[i].prefetch_in_loop ? "yes" : "no",
|
||||
info[i].prefetch_before_loop ? "yes" : "no");
|
||||
}
|
||||
fprintf (loop_dump_stream, "Real prefetches needed:%i (write:%i)\n",
|
||||
num_real_prefetches, num_real_write_prefetches);
|
||||
}
|
||||
|
||||
if (!num_real_prefetches)
|
||||
return;
|
||||
|
||||
ahead = (SIMULTANEOUS_PREFETCHES / (num_real_prefetches));
|
||||
|
||||
if (!ahead)
|
||||
return;
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
{
|
||||
if (info[i].prefetch_in_loop)
|
||||
{
|
||||
int y;
|
||||
for (y = 0; y < ((info[i].stride + PREFETCH_BLOCK - 1)
|
||||
/ PREFETCH_BLOCK); y++)
|
||||
{
|
||||
rtx loc = copy_rtx (*info[i].giv->location);
|
||||
rtx insn;
|
||||
int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
|
||||
rtx before_insn = info[i].giv->insn;
|
||||
rtx prev_insn = PREV_INSN (info[i].giv->insn);
|
||||
|
||||
/* We can save some effort by offsetting the address on
|
||||
architectures with offsettable memory references. */
|
||||
if (offsettable_address_p (0, VOIDmode, loc))
|
||||
loc = plus_constant (loc, bytes_ahead);
|
||||
else
|
||||
{
|
||||
rtx reg = gen_reg_rtx (Pmode);
|
||||
loop_iv_add_mult_emit_before (loop, loc, const1_rtx,
|
||||
GEN_INT (bytes_ahead), reg,
|
||||
0, before_insn);
|
||||
loc = reg;
|
||||
}
|
||||
|
||||
emit_insn_before (gen_prefetch (loc, GEN_INT (info[i].write),
|
||||
GEN_INT (3)), before_insn);
|
||||
|
||||
/* Check all insns emitted and record the new GIV information. */
|
||||
insn = NEXT_INSN (prev_insn);
|
||||
while (insn != before_insn)
|
||||
{
|
||||
insn = check_insn_for_givs (loop, insn,
|
||||
info[i].giv->always_executed,
|
||||
info[i].giv->maybe_multiple);
|
||||
insn = NEXT_INSN (insn);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (info[i].prefetch_before_loop)
|
||||
{
|
||||
int y;
|
||||
/* Emit INSNs before the loop to fetch the first cache lines. */
|
||||
for (y = 0; ((!info[i].prefetch_in_loop || y < ahead)
|
||||
&& y * PREFETCH_BLOCK < (int)info[i].total_bytes); y ++)
|
||||
{
|
||||
rtx reg = gen_reg_rtx (Pmode);
|
||||
rtx loop_start = loop->start;
|
||||
rtx add_val = simplify_gen_binary (PLUS, Pmode,
|
||||
info[i].giv->add_val,
|
||||
GEN_INT (y * PREFETCH_BLOCK));
|
||||
loop_iv_add_mult_emit_before (loop, info[i].class->initial_value,
|
||||
info[i].giv->mult_val,
|
||||
add_val, reg, 0, loop_start);
|
||||
emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write),
|
||||
GEN_INT (3)), loop_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* A "basic induction variable" or biv is a pseudo reg that is set
|
||||
(within this loop) only by incrementing or decrementing it. */
|
||||
/* A "general induction variable" or giv is a pseudo reg whose
|
||||
@ -4298,6 +4886,11 @@ strength_reduce (loop, flags)
|
||||
fail if the iteration variable is a giv. */
|
||||
loop_iterations (loop);
|
||||
|
||||
#ifdef HAVE_prefetch
|
||||
if (flags & LOOP_PREFETCH)
|
||||
emit_prefetch_instructions (loop);
|
||||
#endif
|
||||
|
||||
/* Now for each giv for which we still don't know whether or not it is
|
||||
replaceable, check to see if it is replaceable because its final value
|
||||
can be calculated. This must be done after loop_iterations is called,
|
||||
|
gcc/loop.h
@@ -27,6 +27,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* Flags passed to loop_optimize. */
#define LOOP_UNROLL 1
#define LOOP_BCT 2
#define LOOP_PREFETCH 4

/* Get the loop info pointer of a loop. */
#define LOOP_INFO(LOOP) ((struct loop_info *) (LOOP)->aux)
gcc/predict.c | 151
@ -329,12 +329,17 @@ estimate_probability (loops_info)
|
||||
for (i = 0; i < loops_info->num; i++)
|
||||
{
|
||||
int j;
|
||||
int exits;
|
||||
struct loop *loop = &loops_info->array[i];
|
||||
|
||||
for (j = loops_info->array[i].first->index;
|
||||
j <= loops_info->array[i].last->index;
|
||||
flow_loop_scan (loops_info, loop, LOOP_EXIT_EDGES);
|
||||
exits = loop->num_exits;
|
||||
|
||||
for (j = loop->first->index;
|
||||
j <= loop->last->index;
|
||||
++j)
|
||||
{
|
||||
if (TEST_BIT (loops_info->array[i].nodes, j))
|
||||
if (TEST_BIT (loop->nodes, j))
|
||||
{
|
||||
int header_found = 0;
|
||||
edge e;
|
||||
@ -342,8 +347,8 @@ estimate_probability (loops_info)
|
||||
/* Loop branch heuristics - predict as taken an edge back to
|
||||
a loop's head. */
|
||||
for (e = BASIC_BLOCK(j)->succ; e; e = e->succ_next)
|
||||
if (e->dest == loops_info->array[i].header
|
||||
&& e->src == loops_info->array[i].latch)
|
||||
if (e->dest == loop->header
|
||||
&& e->src == loop->latch)
|
||||
{
|
||||
header_found = 1;
|
||||
predict_edge_def (e, PRED_LOOP_BRANCH, TAKEN);
|
||||
@ -354,8 +359,11 @@ estimate_probability (loops_info)
|
||||
if (!header_found)
|
||||
for (e = BASIC_BLOCK(j)->succ; e; e = e->succ_next)
|
||||
if (e->dest->index <= 0
|
||||
|| !TEST_BIT (loops_info->array[i].nodes, e->dest->index))
|
||||
predict_edge_def (e, PRED_LOOP_EXIT, NOT_TAKEN);
|
||||
|| !TEST_BIT (loop->nodes, e->dest->index))
|
||||
predict_edge (e, PRED_LOOP_EXIT,
|
||||
(REG_BR_PROB_BASE
|
||||
- predictor_info [(int)PRED_LOOP_EXIT].hitrate)
|
||||
/ exits);
|
||||
}
|
||||
}
|
||||
}
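A worked instance of the new distribution, with an assumed hitrate (REG_BR_PROB_BASE is 10000 in this tree; the 90% figure below is only an example, not necessarily the value in predict.def):

    /* Illustrative arithmetic only.  If PRED_LOOP_EXIT had a 90% hitrate
       and the loop had 3 exit edges, each exit edge would be predicted
       taken with probability
           (REG_BR_PROB_BASE - hitrate) / exits = (10000 - 9000) / 3 = 333,
       i.e. roughly 3.3% per exit, rather than the full 10% being assigned
       to every exit edge as before.  */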
|
||||
@ -435,74 +443,83 @@ estimate_probability (loops_info)
|
||||
/* Try "pointer heuristic."
|
||||
A comparison ptr == 0 is predicted as false.
|
||||
Similarly, a comparison ptr1 == ptr2 is predicted as false. */
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case EQ:
|
||||
if (GET_CODE (XEXP (cond, 0)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 0))
|
||||
&& (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 1)))))
|
||||
|
||||
if (GET_RTX_CLASS (GET_CODE (cond)) == '<'
|
||||
&& ((REG_P (XEXP (cond, 0)) && REG_POINTER (XEXP (cond, 0)))
|
||||
|| (REG_P (XEXP (cond, 1)) && REG_POINTER (XEXP (cond, 1)))))
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case EQ:
|
||||
predict_insn_def (last_insn, PRED_POINTER, NOT_TAKEN);
|
||||
break;
|
||||
case NE:
|
||||
if (GET_CODE (XEXP (cond, 0)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 0))
|
||||
&& (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 1)))))
|
||||
break;
|
||||
case NE:
|
||||
predict_insn_def (last_insn, PRED_POINTER, TAKEN);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
else
|
||||
/* Try "opcode heuristic."
|
||||
EQ tests are usually false and NE tests are usually true. Also,
|
||||
most quantities are positive, so we can make the appropriate guesses
|
||||
about signed comparisons against zero. */
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case CONST_INT:
|
||||
/* Unconditional branch. */
|
||||
predict_insn_def (last_insn, PRED_UNCONDITIONAL,
|
||||
cond == const0_rtx ? NOT_TAKEN : TAKEN);
|
||||
break;
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case CONST_INT:
|
||||
/* Unconditional branch. */
|
||||
predict_insn_def (last_insn, PRED_UNCONDITIONAL,
|
||||
cond == const0_rtx ? NOT_TAKEN : TAKEN);
|
||||
break;
|
||||
|
||||
case EQ:
|
||||
case UNEQ:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case NE:
|
||||
case LTGT:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
|
||||
break;
|
||||
case ORDERED:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
|
||||
break;
|
||||
case UNORDERED:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case LE:
|
||||
case LT:
|
||||
if (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == CONST_INT
|
||||
&& INTVAL (XEXP (cond, 1)) == -1))
|
||||
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case GE:
|
||||
case GT:
|
||||
if (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == CONST_INT
|
||||
&& INTVAL (XEXP (cond, 1)) == -1))
|
||||
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
|
||||
break;
|
||||
case EQ:
|
||||
case UNEQ:
|
||||
/* Floating point comparisons appears to behave in a very
|
||||
inpredictable way because of special role of = tests in
|
||||
FP code. */
|
||||
if (FLOAT_MODE_P (GET_MODE (XEXP (cond, 0))))
|
||||
;
|
||||
/* Comparisons with 0 are often used for booleans and there is
|
||||
nothing usefull to predict about them. */
|
||||
else if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 0) == const0_rtx)
|
||||
;
|
||||
else
|
||||
predict_insn_def (last_insn, PRED_OPCODE_NONEQUAL, NOT_TAKEN);
|
||||
break;
|
||||
case NE:
|
||||
case LTGT:
|
||||
/* Floating point comparisons appears to behave in a very
|
||||
inpredictable way because of special role of = tests in
|
||||
FP code. */
|
||||
if (FLOAT_MODE_P (GET_MODE (XEXP (cond, 0))))
|
||||
;
|
||||
/* Comparisons with 0 are often used for booleans and there is
|
||||
nothing usefull to predict about them. */
|
||||
else if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 0) == const0_rtx)
|
||||
;
|
||||
else
|
||||
predict_insn_def (last_insn, PRED_OPCODE_NONEQUAL, TAKEN);
|
||||
break;
|
||||
case ORDERED:
|
||||
predict_insn_def (last_insn, PRED_FPOPCODE, TAKEN);
|
||||
break;
|
||||
case UNORDERED:
|
||||
predict_insn_def (last_insn, PRED_FPOPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case LE:
|
||||
case LT:
|
||||
if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 1) == const1_rtx
|
||||
|| XEXP (cond, 1) == constm1_rtx)
|
||||
predict_insn_def (last_insn, PRED_OPCODE_POSITIVE, NOT_TAKEN);
|
||||
break;
|
||||
case GE:
|
||||
case GT:
|
||||
if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 1) == const1_rtx
|
||||
|| XEXP (cond, 1) == constm1_rtx)
|
||||
predict_insn_def (last_insn, PRED_OPCODE_POSITIVE, TAKEN);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Attach the combined probability to each conditional jump. */
|
||||
|
gcc/predict.def
@@ -89,7 +89,9 @@ DEF_PREDICTOR (PRED_LOOP_HEADER, "loop header", HITRATE (64), 0)
DEF_PREDICTOR (PRED_POINTER, "pointer", HITRATE (83), 0)

/* NE is probable, EQ not etc... */
DEF_PREDICTOR (PRED_OPCODE, "opcode", HITRATE (55), 0)
DEF_PREDICTOR (PRED_OPCODE_POSITIVE, "opcode values positive", HITRATE (78), 0)
DEF_PREDICTOR (PRED_OPCODE_NONEQUAL, "opcode values nonequal", HITRATE (70), 0)
DEF_PREDICTOR (PRED_FPOPCODE, "fp_opcode", HITRATE (90), 0)

/* Branch guarding call is probably taken. */
DEF_PREDICTOR (PRED_CALL, "call", HITRATE (70), 0)
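To make the new predictors concrete, an illustrative C fragment (hypothetical names; the mapping assumes the comparisons reach RTL in the obvious form) showing branches the reorganized opcode heuristics act on:

    extern void f1 (void), f2 (void), f3 (void);

    void
    example (int a, int b, int n, double x, double y)
    {
      if (a != b)   /* integer non-equality: PRED_OPCODE_NONEQUAL, predicted taken.     */
        f1 ();
      if (n > 0)    /* signed compare against zero: PRED_OPCODE_POSITIVE, predicted taken. */
        f2 ();
      if (x == y)   /* floating-point equality: deliberately left unpredicted by the
                       opcode heuristics, since FP == tests behave unpredictably.       */
        f3 ();
    }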
gcc/toplev.c | 24
@@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#include "flags.h"
#include "insn-attr.h"
#include "insn-config.h"
#include "insn-flags.h"
#include "hard-reg-set.h"
#include "recog.h"
#include "output.h"
@@ -544,6 +545,10 @@ int flag_unroll_loops;

int flag_unroll_all_loops;

/* Nonzero enables prefetch optimizations for arrays in loops. */

int flag_prefetch_loop_arrays;

/* Nonzero forces all invariant computations in loops to be moved
   outside the loop. */

@@ -1001,6 +1006,8 @@ lang_independent_options f_options[] =
   N_("Perform loop unrolling when iteration count is known") },
  {"unroll-all-loops", &flag_unroll_all_loops, 1,
   N_("Perform loop unrolling for all loops") },
  {"prefetch-loop-arrays", &flag_prefetch_loop_arrays, 1,
   N_("Generate prefetch instructions, if available, for arrays in loops") },
  {"move-all-movables", &flag_move_all_movables, 1,
   N_("Force all loop invariant computations out of loops") },
  {"reduce-all-givs", &flag_reduce_all_givs, 1,
@@ -2863,7 +2870,8 @@ rest_of_compilation (decl)
    }
  cleanup_barriers ();
  loop_optimize (insns, rtl_dump_file,
                 (flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT);
                 (flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT
                 | (flag_prefetch_loop_arrays ? LOOP_PREFETCH : 0));

  close_dump_file (DFI_loop, print_rtl, insns);
  timevar_pop (TV_LOOP);
@@ -4928,6 +4936,20 @@ process_options ()
      flag_function_sections = 0;
    }

#ifndef HAVE_prefetch
  if (flag_prefetch_loop_arrays)
    {
      warning ("-fprefetch-loop-arrays not supported for this target");
      flag_prefetch_loop_arrays = 0;
    }
#else
  if (flag_prefetch_loop_arrays && !HAVE_prefetch)
    {
      warning ("-fprefetch-loop-arrays not supported for this target (try -march switches)");
      flag_prefetch_loop_arrays = 0;
    }
#endif

#ifndef OBJECT_FORMAT_ELF
  if (flag_function_sections && write_symbols != NO_DEBUG)
    warning ("-ffunction-sections may affect debugging on some targets");