predict.c (estimate_probability): Reorganize opcode heuristics.

* predict.c (estimate_probability): Reorganize opcode heuristics.
	* predict.def (PRED_OPCODE_POSITIVE, PRED_OPCODE_NONEQUAL,
	PRED_FPOPCODE): New.

	* i386.c (override_options): Recognize various CPU variants and set
	SSE/MMX/3dNOW flags accordingly.
	* i386.h (MASK_MMX_SET, MASK_SSE_SET, MASK_SSE2_SET, MASK_3DNOW_SET,
	MASK_3DNOW_A_SET): New.
	(MASK_ACCUMULATE_OUTGOING_ARGS_SET): New.
	(MASK_NO_ACCUMULATE_OUTGOING_ARGS): Delete.
	(MASK_*): Renumber.
	(TARGET_FLAGS): Use new masks.
	(CPP_CPU_SPECS): Recognize new CPU variants.
	* invoke.texi (-mcpu): Update documentation.

	* flags.h (flag_prefetch_loop_arrays): Declare.
	* loop.h (LOOP_PREFETCH): Define new constant.
	* loop.c (strength_reduce): Call emit_prefetch_instructions.
	(MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
	PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
	constants.
	(check_store_data): New structure.
	(check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
	New functions.
	* toplev.c: Include insn-flags.h.
	(flag_prefetch_loop_arrays): New global variable.
	(lang_independent_option): Add -fprefetch-loop-arrays.
	(rest_of_compilation): Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
	is set.
	* Makefile.in (toplev.c): Depend on insn-flags.h.
	* invoke.texi (-fprefetch-loop-arrays): Document.

	* predict.c (estimate_probability): Distribute the loop exit
	probability according to number of exit edges.

	* cfgcleanup.c (insns_match_p): Break out from ...;
	(flow_find_cross_jump): ... here;
	(outgoing_edges_match): Add parameter MODE; attempt to match everything
	except for tablejumps.
	(try_crossjump_to_edge): Accept complex edges.
	(try_crossjump_bb): Likewise.

From-SVN: r47969
Jan Hubicka 2001-12-13 12:34:11 +01:00 committed by Jan Hubicka
parent 85230e5255
commit 0dd0e980b5
12 changed files with 1035 additions and 230 deletions

ChangeLog

@ -1,3 +1,47 @@
Thu Dec 13 12:31:07 CET 2001 Jan Hubicka <jh@suse.cz>
* predict.c (estimate_probability): Reorganize opcode heuristics.
* predict.def (PRED_OPCODE_POSITIVE, PRED_OPCODE_NONEQUAL,
PRED_FPOPCODE): New.
* i386.c (override_options): Recognize various CPU variants and set
SSE/MMX/3dNOW flags accordingly.
* i386.h (MASK_MMX_SET, MASK_SSE_SET, MASK_SSE2_SET, MASK_3DNOW_SET,
MASK_3DNOW_A_SET): New.
(MASK_ACCUMULATE_OUTGOING_ARGS_SET): New.
(MASK_NO_ACCUMULATE_OUTGOING_ARGS): Delete.
(MASK_*): Renumber.
(TARGET_FLAGS): Use new masks.
(CPP_CPU_SPECS): Recognize new CPU variants.
* invoke.texi (-mcpu): Update documentation.
* flags.h (flag_prefetch_loop_arrays): Declare.
* loop.h (LOOP_PREFETCH): Define new constant.
* loop.c (strength_reduce): Call emit_prefetch_instructions.
(MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
constants.
(check_store_data): New structure.
(check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
New functions.
* toplev.c: Include insn-flags.h.
(flag_prefetch_loop_arrays): New global variable.
(lang_independent_option): Add -fprefetch-loop-arrays.
(rest_of_compilation): Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
is set.
* Makefile.in (toplev.c): Depend on insn-flags.h.
* invoke.texi (-fprefetch-loop-arrays): Document.
* predict.c (estimate_probability): Distribute the loop exit
probability according to number of exit edges.
* cfgcleanup.c (insns_match_p): Break out from ...;
(flow_find_cross_jump): ... here;
(outgoing_edges_match): Add parameter MODE; attempt to match everything
except for tablejumps.
(try_crossjump_to_edge): Accept complex edges.
(try_crossjump_bb): Likewise.
2001-11-29 Corey Minyard <minyard@acm.org>
* recog.c (validate_replace_rtx_1): Use simplify_gen_binary

Makefile.in

@ -1321,7 +1321,7 @@ toplev.o : toplev.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(RTL_H) function.h \
dwarf2out.h sdbout.h dbxout.h $(EXPR_H) hard-reg-set.h $(BASIC_BLOCK_H) \
graph.h $(LOOP_H) except.h $(REGS_H) $(TIMEVAR_H) $(lang_options_files) \
ssa.h $(PARAMS_H) $(TM_P_H) reload.h dwarf2asm.h $(TARGET_H) halfpic.h \
langhooks.h
langhooks.h insn-flags.h
$(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
-DTARGET_NAME=\"$(target_alias)\" \
-c $(srcdir)/toplev.c $(OUTPUT_OPTION)

cfgcleanup.c

@ -64,9 +64,11 @@ enum bb_flags {
static bool try_crossjump_to_edge PARAMS ((int, edge, edge));
static bool try_crossjump_bb PARAMS ((int, basic_block));
static bool outgoing_edges_match PARAMS ((basic_block, basic_block));
static bool outgoing_edges_match PARAMS ((int,
basic_block, basic_block));
static int flow_find_cross_jump PARAMS ((int, basic_block, basic_block,
rtx *, rtx *));
static bool insns_match_p PARAMS ((int, rtx, rtx));
static bool delete_unreachable_blocks PARAMS ((void));
static bool label_is_jump_target_p PARAMS ((rtx, rtx));
@ -546,6 +548,108 @@ merge_blocks (e, b, c, mode)
return false;
}
/* Return true if I1 and I2 are equivalent and thus can be crossjumped. */
static bool
insns_match_p (mode, i1, i2)
int mode;
rtx i1, i2;
{
rtx p1, p2;
/* Verify that I1 and I2 are equivalent. */
if (GET_CODE (i1) != GET_CODE (i2))
return false;
p1 = PATTERN (i1);
p2 = PATTERN (i2);
if (GET_CODE (p1) != GET_CODE (p2))
return false;
/* If this is a CALL_INSN, compare register usage information.
If we don't check this on stack register machines, the two
CALL_INSNs might be merged leaving reg-stack.c with mismatching
numbers of stack registers in the same basic block.
If we don't check this on machines with delay slots, a delay slot may
be filled that clobbers a parameter expected by the subroutine.
??? We take the simple route for now and assume that if they're
equal, they were constructed identically. */
if (GET_CODE (i1) == CALL_INSN
&& !rtx_equal_p (CALL_INSN_FUNCTION_USAGE (i1),
CALL_INSN_FUNCTION_USAGE (i2)))
return false;
#ifdef STACK_REGS
/* If cross_jump_death_matters is not 0, the insn's mode
indicates whether or not the insn contains any stack-like
regs. */
if ((mode & CLEANUP_POST_REGSTACK) && stack_regs_mentioned (i1))
{
/* If register stack conversion has already been done, then
death notes must also be compared before it is certain that
the two instruction streams match. */
rtx note;
HARD_REG_SET i1_regset, i2_regset;
CLEAR_HARD_REG_SET (i1_regset);
CLEAR_HARD_REG_SET (i2_regset);
for (note = REG_NOTES (i1); note; note = XEXP (note, 1))
if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0)))
SET_HARD_REG_BIT (i1_regset, REGNO (XEXP (note, 0)));
for (note = REG_NOTES (i2); note; note = XEXP (note, 1))
if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0)))
SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0)));
GO_IF_HARD_REG_EQUAL (i1_regset, i2_regset, done);
return false;
done:
;
}
#endif
if (reload_completed
? ! rtx_renumbered_equal_p (p1, p2) : ! rtx_equal_p (p1, p2))
{
/* The following code helps take care of G++ cleanups. */
rtx equiv1 = find_reg_equal_equiv_note (i1);
rtx equiv2 = find_reg_equal_equiv_note (i2);
if (equiv1 && equiv2
/* If the equivalences are not to a constant, they may
reference pseudos that no longer exist, so we can't
use them. */
&& (! reload_completed
|| (CONSTANT_P (XEXP (equiv1, 0))
&& rtx_equal_p (XEXP (equiv1, 0), XEXP (equiv2, 0)))))
{
rtx s1 = single_set (i1);
rtx s2 = single_set (i2);
if (s1 != 0 && s2 != 0
&& rtx_renumbered_equal_p (SET_DEST (s1), SET_DEST (s2)))
{
validate_change (i1, &SET_SRC (s1), XEXP (equiv1, 0), 1);
validate_change (i2, &SET_SRC (s2), XEXP (equiv2, 0), 1);
if (! rtx_renumbered_equal_p (p1, p2))
cancel_changes (0);
else if (apply_change_group ())
return true;
}
}
return false;
}
return true;
}
/* Look through the insns at the end of BB1 and BB2 and find the longest
sequence of equivalent insns. Store the first insns for that sequence
in *F1 and *F2 and return the sequence length.
@ -559,7 +663,7 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
basic_block bb1, bb2;
rtx *f1, *f2;
{
rtx i1, i2, p1, p2, last1, last2, afterlast1, afterlast2;
rtx i1, i2, last1, last2, afterlast1, afterlast2;
int ninsns = 0;
/* Skip simple jumps at the end of the blocks. Complex jumps still
@ -586,100 +690,11 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
if (i1 == bb1->head || i2 == bb2->head)
break;
/* Verify that I1 and I2 are equivalent. */
if (GET_CODE (i1) != GET_CODE (i2))
if (!insns_match_p (mode, i1, i2))
break;
p1 = PATTERN (i1);
p2 = PATTERN (i2);
/* If this is a CALL_INSN, compare register usage information.
If we don't check this on stack register machines, the two
CALL_INSNs might be merged leaving reg-stack.c with mismatching
numbers of stack registers in the same basic block.
If we don't check this on machines with delay slots, a delay slot may
be filled that clobbers a parameter expected by the subroutine.
??? We take the simple route for now and assume that if they're
equal, they were constructed identically. */
if (GET_CODE (i1) == CALL_INSN
&& ! rtx_equal_p (CALL_INSN_FUNCTION_USAGE (i1),
CALL_INSN_FUNCTION_USAGE (i2)))
break;
#ifdef STACK_REGS
/* If cross_jump_death_matters is not 0, the insn's mode
indicates whether or not the insn contains any stack-like
regs. */
if ((mode & CLEANUP_POST_REGSTACK) && stack_regs_mentioned (i1))
{
/* If register stack conversion has already been done, then
death notes must also be compared before it is certain that
the two instruction streams match. */
rtx note;
HARD_REG_SET i1_regset, i2_regset;
CLEAR_HARD_REG_SET (i1_regset);
CLEAR_HARD_REG_SET (i2_regset);
for (note = REG_NOTES (i1); note; note = XEXP (note, 1))
if (REG_NOTE_KIND (note) == REG_DEAD
&& STACK_REG_P (XEXP (note, 0)))
SET_HARD_REG_BIT (i1_regset, REGNO (XEXP (note, 0)));
for (note = REG_NOTES (i2); note; note = XEXP (note, 1))
if (REG_NOTE_KIND (note) == REG_DEAD
&& STACK_REG_P (XEXP (note, 0)))
SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0)));
GO_IF_HARD_REG_EQUAL (i1_regset, i2_regset, done);
break;
done:
;
}
#endif
if (GET_CODE (p1) != GET_CODE (p2))
break;
if (! rtx_renumbered_equal_p (p1, p2))
{
/* The following code helps take care of G++ cleanups. */
rtx equiv1 = find_reg_equal_equiv_note (i1);
rtx equiv2 = find_reg_equal_equiv_note (i2);
if (equiv1 && equiv2
/* If the equivalences are not to a constant, they may
reference pseudos that no longer exist, so we can't
use them. */
&& CONSTANT_P (XEXP (equiv1, 0))
&& rtx_equal_p (XEXP (equiv1, 0), XEXP (equiv2, 0)))
{
rtx s1 = single_set (i1);
rtx s2 = single_set (i2);
if (s1 != 0 && s2 != 0
&& rtx_renumbered_equal_p (SET_DEST (s1), SET_DEST (s2)))
{
validate_change (i1, &SET_SRC (s1), XEXP (equiv1, 0), 1);
validate_change (i2, &SET_SRC (s2), XEXP (equiv2, 0), 1);
if (! rtx_renumbered_equal_p (p1, p2))
cancel_changes (0);
else if (apply_change_group ())
goto win;
}
}
break;
}
win:
/* Don't begin a cross-jump with a USE or CLOBBER insn. */
if (GET_CODE (p1) != USE && GET_CODE (p1) != CLOBBER)
if (active_insn_p (i1))
{
/* If the merged insns have different REG_EQUAL notes, then
remove them. */
@ -743,10 +758,15 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
We may assume that there exists one edge with a common destination. */
static bool
outgoing_edges_match (bb1, bb2)
outgoing_edges_match (mode, bb1, bb2)
int mode;
basic_block bb1;
basic_block bb2;
{
int nehedges1 = 0, nehedges2 = 0;
edge fallthru1 = 0, fallthru2 = 0;
edge e1, e2;
/* If BB1 has only one successor, we must be looking at an unconditional
jump. Which, by the assumption above, means that we only need to check
that BB2 has one successor. */
@ -862,10 +882,60 @@ outgoing_edges_match (bb1, bb2)
return match;
}
/* ??? We can handle computed jumps too. This may be important for
inlined functions containing switch statements. Also jumps w/o
fallthru edges can be handled by simply matching whole insn. */
return false;
/* Generic case - we are seeing a computed jump, table jump or trapping
instruction. */
/* First ensure that the instructions match. There may be many outgoing
edges so this test is generally cheaper.
??? Currently the tablejumps will never match, as they do have
different tables. */
if (!insns_match_p (mode, bb1->end, bb2->end))
return false;
/* Search the outgoing edges, ensure that the counts match, and find possible
fallthru and exception handling edges, since these need more
validation. */
for (e1 = bb1->succ, e2 = bb2->succ; e1 && e2;
e1 = e1->succ_next, e2 = e2->succ_next)
{
if (e1->flags & EDGE_EH)
nehedges1++;
if (e2->flags & EDGE_EH)
nehedges2++;
if (e1->flags & EDGE_FALLTHRU)
fallthru1 = e1;
if (e2->flags & EDGE_FALLTHRU)
fallthru2 = e2;
}
/* If the number of edges of the various types does not match, fail. */
if (e1 || e2)
return false;
if (nehedges1 != nehedges2)
return false;
if ((fallthru1 != 0) != (fallthru2 != 0))
return false;
/* fallthru edges must be forwarded to the same destination. */
if (fallthru1)
{
basic_block d1 = (forwarder_block_p (fallthru1->dest)
? fallthru1->dest->succ->dest: fallthru1->dest);
basic_block d2 = (forwarder_block_p (fallthru2->dest)
? fallthru2->dest->succ->dest: fallthru2->dest);
if (d1 != d2)
return false;
}
/* In case we do have EH edges, ensure we are in the same region. */
if (nehedges1)
{
rtx n1 = find_reg_note (bb1->end, REG_EH_REGION, 0);
rtx n2 = find_reg_note (bb2->end, REG_EH_REGION, 0);
if (XEXP (n1, 0) != XEXP (n2, 0))
return false;
}
/* We don't need to match the rest of the edges, as the above checks should
be enough to ensure that they are equivalent. */
return true;
}
/* E1 and E2 are edges with the same destination block. Search their
@ -924,14 +994,8 @@ try_crossjump_to_edge (mode, e1, e2)
if (!src1->pred || !src2->pred)
return false;
/* Likewise with complex edges.
??? We should be able to handle most complex edges later with some
care. */
if (e1->flags & EDGE_COMPLEX)
return false;
/* Look for the common insn sequence, part the first ... */
if (!outgoing_edges_match (src1, src2))
if (!outgoing_edges_match (mode, src1, src2))
return false;
/* ... and part the second. */
@ -1066,11 +1130,6 @@ try_crossjump_bb (mode, bb)
{
nexte = e->pred_next;
/* Elide complex edges now, as neither try_crossjump_to_edge
nor outgoing_edges_match can handle them. */
if (e->flags & EDGE_COMPLEX)
continue;
/* As noted above, first try with the fallthru predecessor. */
if (fallthru)
{
@ -1113,11 +1172,6 @@ try_crossjump_bb (mode, bb)
if (e2 == fallthru)
continue;
/* Again, neither try_crossjump_to_edge nor outgoing_edges_match
can handle complex edges. */
if (e2->flags & EDGE_COMPLEX)
continue;
/* The "first successor" check above only prevents multiple
checks of crossjump(A,B). In order to prevent redundant
checks of crossjump(B,A), require that A be the block

i386.c

@ -817,18 +817,42 @@ override_options ()
{
const char *const name; /* processor name or nickname. */
const enum processor_type processor;
const enum pta_flags
{
PTA_SSE = 1,
PTA_SSE2 = 2,
PTA_MMX = 4,
PTA_SSEPREFETCH = 8,
PTA_3DNOW = 16,
PTA_3DNOW_A = 64
} flags;
}
const processor_alias_table[] =
{
{"i386", PROCESSOR_I386},
{"i486", PROCESSOR_I486},
{"i586", PROCESSOR_PENTIUM},
{"pentium", PROCESSOR_PENTIUM},
{"i686", PROCESSOR_PENTIUMPRO},
{"pentiumpro", PROCESSOR_PENTIUMPRO},
{"k6", PROCESSOR_K6},
{"athlon", PROCESSOR_ATHLON},
{"pentium4", PROCESSOR_PENTIUM4},
{"i386", PROCESSOR_I386, 0},
{"i486", PROCESSOR_I486, 0},
{"i586", PROCESSOR_PENTIUM, 0},
{"pentium", PROCESSOR_PENTIUM, 0},
{"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
{"i686", PROCESSOR_PENTIUMPRO, 0},
{"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
{"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
{"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSEPREFETCH},
{"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2 |
PTA_MMX | PTA_SSEPREFETCH},
{"k6", PROCESSOR_K6, PTA_MMX},
{"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
{"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
{"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
| PTA_3DNOW_A},
{"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH
| PTA_3DNOW | PTA_3DNOW_A},
{"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
| PTA_3DNOW_A | PTA_SSE},
{"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
| PTA_3DNOW_A | PTA_SSE},
{"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
| PTA_3DNOW_A | PTA_SSE},
};
int const pta_size = sizeof (processor_alias_table) / sizeof (struct pta);
@ -880,6 +904,21 @@ override_options ()
ix86_arch = processor_alias_table[i].processor;
/* Default cpu tuning to the architecture. */
ix86_cpu = ix86_arch;
if (processor_alias_table[i].flags & PTA_MMX
&& !(target_flags & MASK_MMX_SET))
target_flags |= MASK_MMX;
if (processor_alias_table[i].flags & PTA_3DNOW
&& !(target_flags & MASK_3DNOW_SET))
target_flags |= MASK_3DNOW;
if (processor_alias_table[i].flags & PTA_3DNOW_A
&& !(target_flags & MASK_3DNOW_A_SET))
target_flags |= MASK_3DNOW_A;
if (processor_alias_table[i].flags & PTA_SSE
&& !(target_flags & MASK_SSE_SET))
target_flags |= MASK_SSE;
if (processor_alias_table[i].flags & PTA_SSE2
&& !(target_flags & MASK_SSE2_SET))
target_flags |= MASK_SSE2;
break;
}
@ -1045,7 +1084,7 @@ override_options ()
target_flags |= MASK_3DNOW_A;
}
if ((x86_accumulate_outgoing_args & CPUMASK)
&& !(target_flags & MASK_NO_ACCUMULATE_OUTGOING_ARGS)
&& !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS_SET)
&& !optimize_size)
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
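The checks above rely on the companion *_SET masks added to i386.h: each -m<feature>/-mno-<feature> switch sets both the feature bit and its _SET bit, so the CPU defaults from the alias table are applied only when the user has not chosen explicitly. A minimal stand-alone sketch of that idiom (the names below are illustrative, not the real i386 macros):

#include <stdio.h>

#define MASK_FEATURE      0x1	/* Feature enabled.  */
#define MASK_FEATURE_SET  0x2	/* Set whenever -mfeature or -mno-feature was given.  */

static int demo_target_flags;

/* Apply a CPU default only if the user did not decide explicitly.  */
static void
apply_cpu_default (int cpu_has_feature)
{
  if (cpu_has_feature && !(demo_target_flags & MASK_FEATURE_SET))
    demo_target_flags |= MASK_FEATURE;
}

int
main (void)
{
  /* Simulate "-mno-feature": the _SET bit is recorded, the feature bit is
     not, so the CPU default must not re-enable the feature.  */
  demo_target_flags = MASK_FEATURE_SET;
  apply_cpu_default (1);
  printf ("feature enabled: %d\n", (demo_target_flags & MASK_FEATURE) != 0);
  return 0;
}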

i386.h

@ -112,25 +112,30 @@ extern int target_flags;
#define MASK_NO_FANCY_MATH_387 0x00000040 /* Disable sin, cos, sqrt */
#define MASK_OMIT_LEAF_FRAME_POINTER 0x080 /* omit leaf frame pointers */
#define MASK_STACK_PROBE 0x00000100 /* Enable stack probing */
#define MASK_NO_ALIGN_STROPS 0x00001000 /* Enable aligning of string ops. */
#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */
#define MASK_NO_PUSH_ARGS 0x00004000 /* Use push instructions */
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00008000/* Accumulate outgoing args */
#define MASK_NO_ACCUMULATE_OUTGOING_ARGS 0x00010000
#define MASK_MMX 0x00020000 /* Support MMX regs/builtins */
#define MASK_SSE 0x00040000 /* Support SSE regs/builtins */
#define MASK_SSE2 0x00080000 /* Support SSE2 regs/builtins */
#define MASK_NO_ALIGN_STROPS 0x00000200 /* Enable aligning of string ops. */
#define MASK_INLINE_ALL_STROPS 0x00000400 /* Inline stringops in all cases */
#define MASK_NO_PUSH_ARGS 0x00000800 /* Use push instructions */
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00001000/* Accumulate outgoing args */
#define MASK_ACCUMULATE_OUTGOING_ARGS_SET 0x00002000
#define MASK_MMX 0x00004000 /* Support MMX regs/builtins */
#define MASK_MMX_SET 0x00008000
#define MASK_SSE 0x00010000 /* Support SSE regs/builtins */
#define MASK_SSE_SET 0x00020000
#define MASK_SSE2 0x00040000 /* Support SSE2 regs/builtins */
#define MASK_SSE2_SET 0x00080000
#define MASK_3DNOW 0x00100000 /* Support 3Dnow builtins */
#define MASK_3DNOW_A 0x00200000 /* Support Athlon 3Dnow builtins */
#define MASK_128BIT_LONG_DOUBLE 0x00400000 /* long double size is 128bit */
#define MASK_MIX_SSE_I387 0x00800000 /* Mix SSE and i387 instructions */
#define MASK_64BIT 0x01000000 /* Produce 64bit code */
#define MASK_NO_RED_ZONE 0x02000000 /* Do not use red zone */
#define MASK_3DNOW_SET 0x00200000
#define MASK_3DNOW_A 0x00400000 /* Support Athlon 3Dnow builtins */
#define MASK_3DNOW_A_SET 0x00800000
#define MASK_128BIT_LONG_DOUBLE 0x01000000 /* long double size is 128bit */
#define MASK_MIX_SSE_I387 0x02000000 /* Mix SSE and i387 instructions */
#define MASK_64BIT 0x04000000 /* Produce 64bit code */
#define MASK_NO_RED_ZONE 0x08000000 /* Do not use red zone */
/* Temporary codegen switches */
#define MASK_INTEL_SYNTAX 0x00000200
#define MASK_DEBUG_ARG 0x00000400 /* function_arg */
#define MASK_DEBUG_ADDR 0x00000800 /* GO_IF_LEGITIMATE_ADDRESS */
#define MASK_INTEL_SYNTAX 0x10000000
#define MASK_DEBUG_ARG 0x20000000 /* function_arg */
#define MASK_DEBUG_ADDR 0x40000000 /* GO_IF_LEGITIMATE_ADDRESS */
/* Use the floating point instructions */
#define TARGET_80387 (target_flags & MASK_80387)
@ -335,24 +340,30 @@ extern const int x86_epilogue_using_move, x86_decompose_lea;
N_("Use push instructions to save outgoing arguments") }, \
{ "no-push-args", MASK_NO_PUSH_ARGS, \
N_("Do not use push instructions to save outgoing arguments") }, \
{ "accumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS, \
{ "accumulate-outgoing-args", (MASK_ACCUMULATE_OUTGOING_ARGS \
| MASK_ACCUMULATE_OUTGOING_ARGS_SET), \
N_("Use push instructions to save outgoing arguments") }, \
{ "no-accumulate-outgoing-args",MASK_NO_ACCUMULATE_OUTGOING_ARGS, \
{ "no-accumulate-outgoing-args",MASK_ACCUMULATE_OUTGOING_ARGS_SET, \
N_("Do not use push instructions to save outgoing arguments") }, \
{ "mmx", MASK_MMX, N_("Support MMX builtins") }, \
{ "no-mmx", -MASK_MMX, \
{ "mmx", MASK_MMX | MASK_MMX_SET, \
N_("Support MMX builtins") }, \
{ "no-mmx", -MASK_MMX, \
N_("Do not support MMX builtins") }, \
{ "3dnow", MASK_3DNOW, \
{ "no-mmx", MASK_MMX_SET, N_("") }, \
{ "3dnow", MASK_3DNOW | MASK_3DNOW_SET, \
N_("Support 3DNow! builtins") }, \
{ "no-3dnow", -MASK_3DNOW, \
{ "no-3dnow", -MASK_3DNOW, N_("") }, \
{ "no-3dnow", MASK_3DNOW_SET, \
N_("Do not support 3DNow! builtins") }, \
{ "sse", MASK_SSE, \
{ "sse", MASK_SSE | MASK_SSE_SET, \
N_("Support MMX and SSE builtins and code generation") }, \
{ "no-sse", -MASK_SSE, \
{ "no-sse", -MASK_SSE, N_("") }, \
{ "no-sse", MASK_SSE_SET, \
N_("Do not support MMX and SSE builtins and code generation") }, \
{ "sse2", MASK_SSE2, \
{ "sse2", MASK_SSE2 | MASK_SSE2_SET, \
N_("Support MMX, SSE and SSE2 builtins and code generation") }, \
{ "no-sse2", -MASK_SSE2, \
{ "no-sse2", -MASK_SSE2, N_("") }, \
{ "no-sse2", MASK_SSE2_SET, \
N_("Do not support MMX, SSE and SSE2 builtins and code generation") }, \
{ "mix-sse-i387", MASK_MIX_SSE_I387, \
N_("Use both SSE and i387 instruction sets for floating point arithmetics") },\
@ -522,11 +533,22 @@ extern int ix86_arch;
%{march=pentium4:-D__pentium4 -D__pentium4__ %{!mcpu*:-D__tune_pentium4__ }}\
%{m386|mcpu=i386:-D__tune_i386__ }\
%{m486|mcpu=i486:-D__tune_i486__ }\
%{mpentium|mcpu=pentium|mcpu=i586:-D__tune_i586__ -D__tune_pentium__ }\
%{mpentiumpro|mcpu=pentiumpro|mcpu=i686:-D__tune_i686__ -D__tune_pentiumpro__ }\
%{mcpu=k6:-D__tune_k6__ }\
%{mcpu=athlon:-D__tune_athlon__ }\
%{mpentium|mcpu=pentium|mcpu=i586|mcpu=pentium-mmx:-D__tune_i586__ -D__tune_pentium__ }\
%{mpentiumpro|mcpu=pentiumpro|mcpu=i686|cpu=pentium2|cpu=pentium3:-D__tune_i686__\
-D__tune_pentiumpro__ }\
%{mcpu=k6|mcpu=k6-2|mcpu=k6-3:-D__tune_k6__ }\
%{mcpu=athlon|mcpu=athlon-tbird|mcpu=athlon-4|mcpu=athlon-xp|mcpu=athlon-mp:\
-D__tune_athlon__ }\
%{mcpu=pentium4:-D__tune_pentium4__ }\
%{march=march=athlon-tbird|march=athlon-xp|march=athlon-mp|march=pentium3|march=pentium4:\
-D__SSE__ }\
%{march=pentium-mmx|march=k6|march=k6-2|march=k6-3\
march=athlon|march=athlon-tbird|march=athlon-4|march=athlon-xp\
|march=athlon-mp|march=pentium2|march=pentium3|march=pentium4: -D__MMX__ }\
%{march=k6|march=k6-2|march=k6-3\
march=athlon|march=athlon-tbird|march=athlon-4|march=athlon-xp\
|march=athlon-mp: -D__3dNOW__ }\
%{mcpu=mcpu=pentium4: -D__SSE2__ }\
%{!march*:%{!mcpu*:%{!m386:%{!m486:%{!mpentium*:%(cpp_cpu_default)}}}}}"
#ifndef CPP_CPU_SPEC

invoke.texi

@ -272,7 +272,7 @@ in the following sections.
-fno-inline -fno-math-errno -fno-peephole -fno-peephole2 @gol
-funsafe-math-optimizations -fno-trapping-math @gol
-fomit-frame-pointer -foptimize-register-move @gol
-foptimize-sibling-calls -freduce-all-givs @gol
-foptimize-sibling-calls -fprefetch-loop-arrays -freduce-all-givs @gol
-fregmove -frename-registers @gol
-frerun-cse-after-loop -frerun-loop-opt @gol
-fschedule-insns -fschedule-insns2 @gol
@ -3570,6 +3570,10 @@ the loop is entered. This usually makes programs run more slowly.
@option{-funroll-all-loops} implies the same options as
@option{-funroll-loops},
@item -fprefetch-loop-arrays
@opindex fprefetch-loop-arrays
If supported by the target machine, generate instructions to prefetch
memory to improve the performance of loops that access large arrays.
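For illustration only (this example is not part of the patch or of the manual text), the option is aimed at long sequential walks over arrays such as:

/* With -O2 -fprefetch-loop-arrays on a target that has a prefetch
   instruction, the loop optimizer inserts prefetches a few cache
   blocks ahead of a[i].  */
double
sum (const double *a, long n)
{
  double s = 0.0;
  long i;

  for (i = 0; i < n; i++)
    s += a[i];
  return s;
}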
@item -fmove-all-movables
@opindex fmove-all-movables
@ -7476,10 +7480,13 @@ computers:
@table @gcctabopt
@item -mcpu=@var{cpu-type}
@opindex mcpu
Assume the defaults for the machine type @var{cpu-type} when scheduling
instructions. The choices for @var{cpu-type} are @samp{i386},
@samp{i486}, @samp{i586}, @samp{i686}, @samp{pentium},
@samp{pentiumpro}, @samp{pentium4}, @samp{k6}, and @samp{athlon}
Tune to @var{cpu-type} everything applicable about the generated code, except
for the ABI and the set of available instructions. The choices for
@var{cpu-type} are @samp{i386}, @samp{i486}, @samp{i586}, @samp{i686},
@samp{pentium}, @samp{pentium-mmx}, @samp{pentiumpro}, @samp{pentium2},
@samp{pentium3}, @samp{pentium4}, @samp{k6}, @samp{k6-2}, @samp{k6-3},
@samp{athlon}, @samp{athlon-tbird}, @samp{athlon-4}, @samp{athlon-xp}
and @samp{athlon-mp}.
While picking a specific @var{cpu-type} will schedule things appropriately
for that particular chip, the compiler will not generate any code that

flags.h

@ -269,6 +269,10 @@ extern int flag_unroll_all_loops;
extern int flag_move_all_movables;
/* Nonzero enables prefetch optimizations for arrays in loops. */
extern int flag_prefetch_loop_arrays;
/* Nonzero forces all general induction variables in loops to be
strength reduced. */

loop.c

@ -53,6 +53,90 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#include "except.h"
#include "toplev.h"
#include "predict.h"
#include "insn-flags.h"
/* Not really meaningful values, but at least something. */
#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif
#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif
#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#define gen_prefetch(a,b,c) (abort(), NULL_RTX)
#endif
/* Give up the prefetch optimizations once we exceed a given threshold.
It is unlikely that we would be able to optimize something in a loop
with so many detected prefetches. */
#define MAX_PREFETCHES 100
/* The number of prefetch blocks that are beneficial to fetch at once before
a loop with a known (and low) iteration count. */
#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 6
/* For very tiny loops it is not worthwhile to prefetch even before the loop,
since it is likely that the data are already in the cache. */
#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN 2
/* The minimal number of prefetch blocks that a loop must consume to make
emitting the prefetch instruction in the body of the loop worthwhile. */
#define PREFETCH_BLOCKS_IN_LOOP_MIN 6
/* Parameterize some prefetch heuristics so they can be turned on and off
easily for performance testing on new architectures. These can be
defined in target-dependent files. */
/* Prefetch is worthwhile only when loads/stores are dense. */
#ifndef PREFETCH_ONLY_DENSE_MEM
#define PREFETCH_ONLY_DENSE_MEM 1
#endif
/* Define what we mean by "dense" loads and stores; this value divided by 256
is the minimum fraction of memory references that are worth prefetching. */
#ifndef PREFETCH_DENSE_MEM
#define PREFETCH_DENSE_MEM 220
#endif
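As a worked check of this threshold (an illustration, not part of the patch), the test applied later in emit_prefetch_instructions is bytes_accesed * 256 / stride > PREFETCH_DENSE_MEM, so the default of 220 demands that roughly 86% of each stride be touched per iteration:

/* Illustrative restatement of the density test; the function name is
   made up.  For example, 16 bytes touched per 64-byte stride gives
   16 * 256 / 64 = 64, which is below 220, so such a sparse reference
   would not be prefetched.  */
static int
dense_enough (unsigned int bytes_accessed, unsigned int stride)
{
  return bytes_accessed * 256 / stride > PREFETCH_DENSE_MEM;
}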
/* Do not prefetch for a loop whose iteration count is known to be low. */
#ifndef PREFETCH_NO_LOW_LOOPCNT
#define PREFETCH_NO_LOW_LOOPCNT 1
#endif
/* Define what we mean by a "low" iteration count. */
#ifndef PREFETCH_LOW_LOOPCNT
#define PREFETCH_LOW_LOOPCNT 32
#endif
/* Do not prefetch for a loop that contains a function call; such a loop is
probably not an internal loop. */
#ifndef PREFETCH_NO_CALL
#define PREFETCH_NO_CALL 1
#endif
/* Do not prefetch accesses with an extreme stride. */
#ifndef PREFETCH_NO_EXTREME_STRIDE
#define PREFETCH_NO_EXTREME_STRIDE 1
#endif
/* Define what we mean by an "extreme" stride. */
#ifndef PREFETCH_EXTREME_STRIDE
#define PREFETCH_EXTREME_STRIDE 4096
#endif
/* Do not handle reversed order prefetches (negative stride). */
#ifndef PREFETCH_NO_REVERSE_ORDER
#define PREFETCH_NO_REVERSE_ORDER 1
#endif
/* Prefetch even if the GIV is not always executed. */
#ifndef PREFETCH_NOT_ALWAYS
#define PREFETCH_NOT_ALWAYS 0
#endif
/* If the loop requires more prefetches than the target can process in
parallel then don't prefetch anything in that loop. */
#ifndef PREFETCH_LIMIT_TO_SIMULTANEOUS
#define PREFETCH_LIMIT_TO_SIMULTANEOUS 1
#endif
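Because every parameter above is wrapped in #ifndef, a port that wants different heuristics can simply define the macros in its target headers and the defaults in loop.c stay out of the way. A hypothetical override block (the values are made up, not taken from any real port):

/* Hypothetical target-header overrides for the prefetch heuristics.  */
#define PREFETCH_BLOCK 128		/* Bytes covered by one prefetch.  */
#define SIMULTANEOUS_PREFETCHES 8	/* Prefetches the CPU tracks at once.  */
#define PREFETCH_LOW_LOOPCNT 16		/* "Low" iteration-count cutoff.  */
#define PREFETCH_NOT_ALWAYS 1		/* Prefetch even if the GIV is not always executed.  */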
#define LOOP_REG_LIFETIME(LOOP, REGNO) \
((REGNO_LAST_LUID (REGNO) - REGNO_FIRST_LUID (REGNO)))
@ -262,6 +346,7 @@ static rtx loop_insn_sink_or_swim PARAMS((const struct loop *, rtx));
static void loop_dump_aux PARAMS ((const struct loop *, FILE *, int));
static void loop_delete_insns PARAMS ((rtx, rtx));
static int remove_constant_addition PARAMS ((rtx *));
void debug_ivs PARAMS ((const struct loop *));
void debug_iv_class PARAMS ((const struct iv_class *));
void debug_biv PARAMS ((const struct induction *));
@ -3412,6 +3497,509 @@ loop_reg_used_before_p (loop, set, insn)
return 0;
}
/* Information we collect about arrays that we might want to prefetch. */
struct prefetch_info
{
struct iv_class *class; /* Class this prefetch is based on. */
struct induction *giv; /* GIV this prefetch is based on. */
rtx base_address; /* Start prefetching from this address plus
index. */
HOST_WIDE_INT index;
HOST_WIDE_INT stride; /* Prefetch stride in bytes in each
iteration. */
unsigned int bytes_accesed; /* Sum of sizes of all accesses to this
prefetch area in one iteration. */
unsigned int total_bytes; /* Total bytes loop will access in this block.
This is set only for loops with known
iteration counts and is 0xffffffff
otherwise. */
unsigned int write : 1; /* 1 for read/write prefetches. */
unsigned int prefetch_in_loop : 1;
/* 1 for those chosen for prefetching. */
unsigned int prefetch_before_loop : 1;
/* 1 for those chosen for prefetching. */
};
/* Data used by check_store function. */
struct check_store_data
{
rtx mem_address;
int mem_write;
};
static void check_store PARAMS ((rtx, rtx, void *));
static void emit_prefetch_instructions PARAMS ((struct loop *));
static int rtx_equal_for_prefetch_p PARAMS ((rtx, rtx));
/* Set mem_write when mem_address is found. Used as callback to
note_stores. */
static void
check_store (x, pat, data)
rtx x, pat ATTRIBUTE_UNUSED;
void *data;
{
struct check_store_data *d = (struct check_store_data *)data;
if ((GET_CODE (x) == MEM) && rtx_equal_p (d->mem_address, XEXP (x, 0)))
d->mem_write = 1;
}
/* Like rtx_equal_p, but attempts to swap commutative operands. This is
important to get some addresses combined. Later more sophisticated
transformations can be added when necessary.
??? The same trick with swapping operands is done at several other places.
It would be nice to develop some common way to handle this. */
static int
rtx_equal_for_prefetch_p (x, y)
rtx x, y;
{
int i;
int j;
enum rtx_code code = GET_CODE (x);
const char *fmt;
if (x == y)
return 1;
if (code != GET_CODE (y))
return 0;
code = GET_CODE (x);
if (GET_RTX_CLASS (code) == 'c')
{
return ((rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 0))
&& rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 1)))
|| (rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 1))
&& rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 0))));
}
/* Compare the elements. If any pair of corresponding elements fails to
match, return 0 for the whole thing. */
fmt = GET_RTX_FORMAT (code);
for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
{
switch (fmt[i])
{
case 'w':
if (XWINT (x, i) != XWINT (y, i))
return 0;
break;
case 'i':
if (XINT (x, i) != XINT (y, i))
return 0;
break;
case 'E':
/* Two vectors must have the same length. */
if (XVECLEN (x, i) != XVECLEN (y, i))
return 0;
/* And the corresponding elements must match. */
for (j = 0; j < XVECLEN (x, i); j++)
if (rtx_equal_for_prefetch_p (XVECEXP (x, i, j),
XVECEXP (y, i, j)) == 0)
return 0;
break;
case 'e':
if (rtx_equal_for_prefetch_p (XEXP (x, i), XEXP (y, i)) == 0)
return 0;
break;
case 's':
if (strcmp (XSTR (x, i), XSTR (y, i)))
return 0;
break;
case 'u':
/* These are just backpointers, so they don't matter. */
break;
case '0':
break;
/* It is believed that rtx's at this level will never
contain anything but integers and other rtx's,
except for within LABEL_REFs and SYMBOL_REFs. */
default:
abort ();
}
}
return 1;
}
/* Remove constant addition value from the expression X (when present)
and return it. */
static HOST_WIDE_INT
remove_constant_addition (x)
rtx *x;
{
HOST_WIDE_INT addval = 0;
rtx exp = *x;
if (GET_CODE (exp) == CONST)
exp = XEXP (exp, 0);
if (GET_CODE (exp) == CONST_INT)
{
addval = INTVAL (exp);
*x = const0_rtx;
}
/* For a PLUS expression, recurse on both operands. */
else if (GET_CODE (exp) == PLUS)
{
addval += remove_constant_addition (&XEXP (exp, 0));
addval += remove_constant_addition (&XEXP (exp, 1));
/* If one of the operands was constant, remove the extra zero
from the expression. */
if (XEXP (exp, 0) == const0_rtx)
*x = XEXP (exp, 1);
else if (XEXP (exp, 1) == const0_rtx)
*x = XEXP (exp, 0);
}
return addval;
}
/* Attempt to identify accesses to arrays that are most likely to cause cache
misses, and emit prefetch instructions a few prefetch blocks forward.
To detect the arrays we use the GIV information that was collected by the
strength reduction pass.
The prefetch instructions are generated after the GIV information has been
collected and before the strength reduction process. The new GIVs are
injected into the strength reduction tables, so the prefetch addresses are
optimized as well.
GIVs are split into base address, stride, and constant addition values.
GIVs with the same address, stride and close addition values are combined
into a single prefetch. Writes to GIVs are also detected, so that
prefetch-for-write instructions can be used for the blocks we write to, on
machines that support write prefetches.
Several heuristics are used to determine when to prefetch. They are
controlled by defined symbols that can be overridden for each target.
*/
static void
emit_prefetch_instructions (struct loop *loop)
{
int num_prefetches = 0;
int num_real_prefetches = 0;
int num_real_write_prefetches = 0;
int ahead;
int i;
struct iv_class *bl;
struct induction *iv;
struct prefetch_info info[MAX_PREFETCHES];
struct loop_ivs *ivs = LOOP_IVS (loop);
if (!HAVE_prefetch)
return;
/* Consider only loops without calls. When a call is present, the loop is
probably slow enough that the memory can be read without prefetching. */
if (PREFETCH_NO_CALL && LOOP_INFO (loop)->has_call)
{
if (loop_dump_stream)
fprintf (loop_dump_stream, "Prefetch: ignoring loop - has call.\n");
return;
}
if (PREFETCH_NO_LOW_LOOPCNT
&& LOOP_INFO (loop)->n_iterations
&& LOOP_INFO (loop)->n_iterations <= PREFETCH_LOW_LOOPCNT)
{
if (loop_dump_stream)
fprintf (loop_dump_stream,
"Prefetch: ignoring loop - not enought iterations.\n");
return;
}
/* Search all induction variables and pick those interesting for the prefetch
machinery. */
for (bl = ivs->list; bl; bl = bl->next)
{
struct induction *biv = bl->biv, *biv1;
int basestride = 0;
biv1 = biv;
/* Expect all BIVs to be executed in each iteration. This makes our
analysis more conservative. */
while (biv1)
{
/* Discard non-constant additions that we can't handle well yet, and
BIVs that are executed multiple times; such BIVs ought to be
handled in the nested loop. We accept not_every_iteration BIVs,
since these only result in larger strides and make our
heuristics more conservative.
??? What does the last sentence mean? */
if (GET_CODE (biv->add_val) != CONST_INT)
{
if (loop_dump_stream)
{
fprintf (loop_dump_stream, "Prefetch: biv %i ignored: non-constant addition at insn %i:",
REGNO (biv->src_reg), INSN_UID (biv->insn));
print_rtl (loop_dump_stream, biv->add_val);
fprintf (loop_dump_stream, "\n");
}
break;
}
if (biv->maybe_multiple)
{
if (loop_dump_stream)
{
fprintf (loop_dump_stream, "Prefetch: biv %i ignored: maybe_multiple at insn %i:",
REGNO (biv->src_reg), INSN_UID (biv->insn));
print_rtl (loop_dump_stream, biv->add_val);
fprintf (loop_dump_stream, "\n");
}
break;
}
basestride += INTVAL (biv1->add_val);
biv1 = biv1->next_iv;
}
if (biv1 || !basestride)
continue;
for (iv = bl->giv; iv; iv = iv->next_iv)
{
rtx address;
rtx temp;
HOST_WIDE_INT index = 0;
int add = 1;
HOST_WIDE_INT stride;
struct check_store_data d;
int size = GET_MODE_SIZE (GET_MODE (iv));
/* There are several reasons why an induction variable is not
interesting to us. */
if (iv->giv_type != DEST_ADDR
/* We are interested only in constant stride memory references
in order to be able to compute density easily. */
|| GET_CODE (iv->mult_val) != CONST_INT
/* Don't handle reversed order prefetches, since they are usually
ineffective. Later we may be able to reverse such BIVs. */
|| (PREFETCH_NO_REVERSE_ORDER
&& (stride = INTVAL (iv->mult_val) * basestride) < 0)
/* Prefetching of accesses with such an extreme stride is probably
not worthwhile, either. */
|| (PREFETCH_NO_EXTREME_STRIDE
&& stride > PREFETCH_EXTREME_STRIDE)
/* Ignore GIVs with varying add values; we can't predict the value
for the next iteration. */
|| !loop_invariant_p (loop, iv->add_val)
/* Ignore GIVs in the nested loops; they ought to have been handled
already. */
|| iv->maybe_multiple)
{
if (loop_dump_stream)
{
fprintf (loop_dump_stream, "Prefetch: Ignoring giv at %i\n",
INSN_UID (iv->insn));
}
continue;
}
/* Determine the pointer to the basic array we are examining. It is
the sum of the BIV's initial value and the GIV's add_val. */
index = 0;
address = copy_rtx (iv->add_val);
temp = copy_rtx (bl->initial_value);
address = simplify_gen_binary (PLUS, Pmode, temp, address);
index = remove_constant_addition (&address);
index += size;
d.mem_write = 0;
d.mem_address = *iv->location;
/* When the GIV is not always executed, we might be better off by
not dirtying the cache pages. */
if (PREFETCH_NOT_ALWAYS || iv->always_executed)
note_stores (PATTERN (iv->insn), check_store, &d);
/* Attempt to find another prefetch to the same array and see if we
can merge this one. */
for (i = 0; i < num_prefetches; i++)
if (rtx_equal_for_prefetch_p (address, info[i].base_address)
&& stride == info[i].stride)
{
/* In case both access the same array (the same location, just with a
small difference in constant indexes), merge the prefetches. Just
handle the later one; the earlier one will get prefetched from the
previous iteration. 4096 is an artificial threshold. It should not
be too small, but also not bigger than the small portion of memory
usually traversed by a single loop. */
if (index >= info[i].index && index - info[i].index < 4096)
{
info[i].write |= d.mem_write;
info[i].bytes_accesed += size;
info[i].index = index;
info[i].giv = iv;
info[i].class = bl;
info[num_prefetches].base_address = address;
add = 0;
break;
}
if (index < info[i].index && info[i].index - index < 4096)
{
info[i].write |= d.mem_write;
info[i].bytes_accesed += size;
add = 0;
break;
}
}
/* Merging failed. */
if (add)
{
info[num_prefetches].giv = iv;
info[num_prefetches].class = bl;
info[num_prefetches].index = index;
info[num_prefetches].stride = stride;
info[num_prefetches].base_address = address;
info[num_prefetches].write = d.mem_write;
info[num_prefetches].bytes_accesed = size;
num_prefetches++;
if (num_prefetches >= MAX_PREFETCHES)
{
if (loop_dump_stream)
fprintf(loop_dump_stream,"Maximal number of prefetches exceeded.\n");
return;
}
}
}
}
for (i = 0; i < num_prefetches; i++)
{
/* Attempt to calculate the number of bytes fetched by the loop.
Avoid overflow. */
if (LOOP_INFO (loop)->n_iterations
&& (0xffffffff / info[i].stride) >= LOOP_INFO (loop)->n_iterations)
info[i].total_bytes = info[i].stride * LOOP_INFO (loop)->n_iterations;
else
info[i].total_bytes = 0xffffffff;
/* Prefetch is worthwhile only when the loads/stores are dense. */
if (PREFETCH_ONLY_DENSE_MEM
&& (info[i].bytes_accesed * 256 / info[i].stride > PREFETCH_DENSE_MEM)
&& (info[i].total_bytes / PREFETCH_BLOCK >=
PREFETCH_BLOCKS_BEFORE_LOOP_MIN))
{
info[i].prefetch_before_loop = 1;
if (info[i].total_bytes / PREFETCH_BLOCK <=
PREFETCH_BLOCKS_BEFORE_LOOP_MAX)
info[i].prefetch_in_loop = 0;
else
info[i].prefetch_in_loop = 1;
}
else
info[i].prefetch_in_loop = 0, info[i].prefetch_before_loop = 0;
if (info[i].prefetch_in_loop)
{
num_real_prefetches += ((info[i].stride + PREFETCH_BLOCK - 1)
/ PREFETCH_BLOCK);
if (info[i].write)
num_real_write_prefetches +=
((info[i].stride + PREFETCH_BLOCK - 1) / PREFETCH_BLOCK);
}
}
if (loop_dump_stream)
{
for (i = 0; i < num_prefetches; i++)
{
fprintf (loop_dump_stream, "Prefetch insn %i address: ",
INSN_UID (info[i].giv->insn));
print_rtl (loop_dump_stream, info[i].base_address);
fprintf (loop_dump_stream, " Index:%i stride:%i density:%i%% total_bytes: %u %s in loop:%s before:%s\n",
info[i].index, info[i].stride,
info[i].bytes_accesed * 100 / info[i].stride,
info[i].total_bytes,
info[i].write ? "read/write" : "read only",
info[i].prefetch_in_loop ? "yes" : "no",
info[i].prefetch_before_loop ? "yes" : "no");
}
fprintf (loop_dump_stream, "Real prefetches needed:%i (write:%i)\n",
num_real_prefetches, num_real_write_prefetches);
}
if (!num_real_prefetches)
return;
ahead = (SIMULTANEOUS_PREFETCHES / (num_real_prefetches));
if (!ahead)
return;
for (i = 0; i < num_prefetches; i++)
{
if (info[i].prefetch_in_loop)
{
int y;
for (y = 0; y < ((info[i].stride + PREFETCH_BLOCK - 1)
/ PREFETCH_BLOCK); y++)
{
rtx loc = copy_rtx (*info[i].giv->location);
rtx insn;
int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
rtx before_insn = info[i].giv->insn;
rtx prev_insn = PREV_INSN (info[i].giv->insn);
/* We can save some effort by offsetting the address on
architectures with offsettable memory references. */
if (offsettable_address_p (0, VOIDmode, loc))
loc = plus_constant (loc, bytes_ahead);
else
{
rtx reg = gen_reg_rtx (Pmode);
loop_iv_add_mult_emit_before (loop, loc, const1_rtx,
GEN_INT (bytes_ahead), reg,
0, before_insn);
loc = reg;
}
emit_insn_before (gen_prefetch (loc, GEN_INT (info[i].write),
GEN_INT (3)), before_insn);
/* Check all insns emitted and record the new GIV information. */
insn = NEXT_INSN (prev_insn);
while (insn != before_insn)
{
insn = check_insn_for_givs (loop, insn,
info[i].giv->always_executed,
info[i].giv->maybe_multiple);
insn = NEXT_INSN (insn);
}
}
}
if (info[i].prefetch_before_loop)
{
int y;
/* Emit INSNs before the loop to fetch the first cache lines. */
for (y = 0; ((!info[i].prefetch_in_loop || y < ahead)
&& y * PREFETCH_BLOCK < (int)info[i].total_bytes); y ++)
{
rtx reg = gen_reg_rtx (Pmode);
rtx loop_start = loop->start;
rtx add_val = simplify_gen_binary (PLUS, Pmode,
info[i].giv->add_val,
GEN_INT (y * PREFETCH_BLOCK));
loop_iv_add_mult_emit_before (loop, info[i].class->initial_value,
info[i].giv->mult_val,
add_val, reg, 0, loop_start);
emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write),
GEN_INT (3)), loop_start);
}
}
}
return;
}
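As an illustration of the merging rule applied in the function above (hypothetical source code, not taken from the patch): two references that share a base address and stride, and whose constant offsets differ by less than the 4096-byte threshold, are folded into a single prefetch stream.

/* Both memory references below have base A and stride sizeof *a; their
   constant offsets differ by only 8 * sizeof *a, well under 4096 bytes,
   so emit_prefetch_instructions treats them as one prefetch.  */
void
shift_left (double *a, long n)
{
  long i;

  for (i = 0; i < n; i++)
    a[i] = a[i + 8];
}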
/* A "basic induction variable" or biv is a pseudo reg that is set
(within this loop) only by incrementing or decrementing it. */
/* A "general induction variable" or giv is a pseudo reg whose
@ -4298,6 +4886,11 @@ strength_reduce (loop, flags)
fail if the iteration variable is a giv. */
loop_iterations (loop);
#ifdef HAVE_prefetch
if (flags & LOOP_PREFETCH)
emit_prefetch_instructions (loop);
#endif
/* Now for each giv for which we still don't know whether or not it is
replaceable, check to see if it is replaceable because its final value
can be calculated. This must be done after loop_iterations is called,

loop.h

@ -27,6 +27,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* Flags passed to loop_optimize. */
#define LOOP_UNROLL 1
#define LOOP_BCT 2
#define LOOP_PREFETCH 4
/* Get the loop info pointer of a loop. */
#define LOOP_INFO(LOOP) ((struct loop_info *) (LOOP)->aux)

predict.c

@ -329,12 +329,17 @@ estimate_probability (loops_info)
for (i = 0; i < loops_info->num; i++)
{
int j;
int exits;
struct loop *loop = &loops_info->array[i];
for (j = loops_info->array[i].first->index;
j <= loops_info->array[i].last->index;
flow_loop_scan (loops_info, loop, LOOP_EXIT_EDGES);
exits = loop->num_exits;
for (j = loop->first->index;
j <= loop->last->index;
++j)
{
if (TEST_BIT (loops_info->array[i].nodes, j))
if (TEST_BIT (loop->nodes, j))
{
int header_found = 0;
edge e;
@ -342,8 +347,8 @@ estimate_probability (loops_info)
/* Loop branch heuristics - predict as taken an edge back to
a loop's head. */
for (e = BASIC_BLOCK(j)->succ; e; e = e->succ_next)
if (e->dest == loops_info->array[i].header
&& e->src == loops_info->array[i].latch)
if (e->dest == loop->header
&& e->src == loop->latch)
{
header_found = 1;
predict_edge_def (e, PRED_LOOP_BRANCH, TAKEN);
@ -354,8 +359,11 @@ estimate_probability (loops_info)
if (!header_found)
for (e = BASIC_BLOCK(j)->succ; e; e = e->succ_next)
if (e->dest->index <= 0
|| !TEST_BIT (loops_info->array[i].nodes, e->dest->index))
predict_edge_def (e, PRED_LOOP_EXIT, NOT_TAKEN);
|| !TEST_BIT (loop->nodes, e->dest->index))
predict_edge (e, PRED_LOOP_EXIT,
(REG_BR_PROB_BASE
- predictor_info [(int)PRED_LOOP_EXIT].hitrate)
/ exits);
}
}
}
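A worked instance of the distribution above (REG_BR_PROB_BASE is 10000; the hitrate value here is hypothetical): with a hitrate of 9000 and three exit edges, each exit edge is predicted taken with probability (10000 - 9000) / 3 = 333, i.e. roughly 3%.

/* Illustrative helper recomputing the per-edge probability assigned above;
   not part of the patch.  */
static int
per_exit_probability (int prob_base, int hitrate, int n_exits)
{
  return (prob_base - hitrate) / n_exits;	/* e.g. (10000 - 9000) / 3 == 333 */
}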
@ -435,74 +443,83 @@ estimate_probability (loops_info)
/* Try "pointer heuristic."
A comparison ptr == 0 is predicted as false.
Similarly, a comparison ptr1 == ptr2 is predicted as false. */
switch (GET_CODE (cond))
{
case EQ:
if (GET_CODE (XEXP (cond, 0)) == REG
&& REG_POINTER (XEXP (cond, 0))
&& (XEXP (cond, 1) == const0_rtx
|| (GET_CODE (XEXP (cond, 1)) == REG
&& REG_POINTER (XEXP (cond, 1)))))
if (GET_RTX_CLASS (GET_CODE (cond)) == '<'
&& ((REG_P (XEXP (cond, 0)) && REG_POINTER (XEXP (cond, 0)))
|| (REG_P (XEXP (cond, 1)) && REG_POINTER (XEXP (cond, 1)))))
switch (GET_CODE (cond))
{
case EQ:
predict_insn_def (last_insn, PRED_POINTER, NOT_TAKEN);
break;
case NE:
if (GET_CODE (XEXP (cond, 0)) == REG
&& REG_POINTER (XEXP (cond, 0))
&& (XEXP (cond, 1) == const0_rtx
|| (GET_CODE (XEXP (cond, 1)) == REG
&& REG_POINTER (XEXP (cond, 1)))))
break;
case NE:
predict_insn_def (last_insn, PRED_POINTER, TAKEN);
break;
default:
break;
}
break;
default:
break;
}
else
/* Try "opcode heuristic."
EQ tests are usually false and NE tests are usually true. Also,
most quantities are positive, so we can make the appropriate guesses
about signed comparisons against zero. */
switch (GET_CODE (cond))
{
case CONST_INT:
/* Unconditional branch. */
predict_insn_def (last_insn, PRED_UNCONDITIONAL,
cond == const0_rtx ? NOT_TAKEN : TAKEN);
break;
switch (GET_CODE (cond))
{
case CONST_INT:
/* Unconditional branch. */
predict_insn_def (last_insn, PRED_UNCONDITIONAL,
cond == const0_rtx ? NOT_TAKEN : TAKEN);
break;
case EQ:
case UNEQ:
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
break;
case NE:
case LTGT:
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
break;
case ORDERED:
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
break;
case UNORDERED:
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
break;
case LE:
case LT:
if (XEXP (cond, 1) == const0_rtx
|| (GET_CODE (XEXP (cond, 1)) == CONST_INT
&& INTVAL (XEXP (cond, 1)) == -1))
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
break;
case GE:
case GT:
if (XEXP (cond, 1) == const0_rtx
|| (GET_CODE (XEXP (cond, 1)) == CONST_INT
&& INTVAL (XEXP (cond, 1)) == -1))
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
break;
case EQ:
case UNEQ:
/* Floating point comparisons appear to behave in a very
unpredictable way because of the special role of = tests in
FP code. */
if (FLOAT_MODE_P (GET_MODE (XEXP (cond, 0))))
;
/* Comparisons with 0 are often used for booleans and there is
nothing useful to predict about them. */
else if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 0) == const0_rtx)
;
else
predict_insn_def (last_insn, PRED_OPCODE_NONEQUAL, NOT_TAKEN);
break;
case NE:
case LTGT:
/* Floating point comparisons appear to behave in a very
unpredictable way because of the special role of = tests in
FP code. */
if (FLOAT_MODE_P (GET_MODE (XEXP (cond, 0))))
;
/* Comparisons with 0 are often used for booleans and there is
nothing useful to predict about them. */
else if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 0) == const0_rtx)
;
else
predict_insn_def (last_insn, PRED_OPCODE_NONEQUAL, TAKEN);
break;
case ORDERED:
predict_insn_def (last_insn, PRED_FPOPCODE, TAKEN);
break;
case UNORDERED:
predict_insn_def (last_insn, PRED_FPOPCODE, NOT_TAKEN);
break;
case LE:
case LT:
if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 1) == const1_rtx
|| XEXP (cond, 1) == constm1_rtx)
predict_insn_def (last_insn, PRED_OPCODE_POSITIVE, NOT_TAKEN);
break;
case GE:
case GT:
if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 1) == const1_rtx
|| XEXP (cond, 1) == constm1_rtx)
predict_insn_def (last_insn, PRED_OPCODE_POSITIVE, TAKEN);
break;
default:
break;
}
default:
break;
}
}
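To make the pointer and opcode heuristics concrete, here is illustrative C (not part of the patch) showing the branch shapes they model:

/* The heuristics above would predict both branches here: the pointer
   test against zero as not taken, and the signed comparison against
   zero as taking the positive side.  */
static int
clamp_positive (const int *p, int n)
{
  if (p == 0)		/* PRED_POINTER: predicted not taken.  */
    return 0;
  if (n > 0)		/* PRED_OPCODE_POSITIVE: predicted taken.  */
    return *p + n;
  return *p;
}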
/* Attach the combined probability to each conditional jump. */

predict.def

@ -89,7 +89,9 @@ DEF_PREDICTOR (PRED_LOOP_HEADER, "loop header", HITRATE (64), 0)
DEF_PREDICTOR (PRED_POINTER, "pointer", HITRATE (83), 0)
/* NE is probable, EQ not etc... */
DEF_PREDICTOR (PRED_OPCODE, "opcode", HITRATE (55), 0)
DEF_PREDICTOR (PRED_OPCODE_POSITIVE, "opcode values positive", HITRATE (78), 0)
DEF_PREDICTOR (PRED_OPCODE_NONEQUAL, "opcode values nonequal", HITRATE (70), 0)
DEF_PREDICTOR (PRED_FPOPCODE, "fp_opcode", HITRATE (90), 0)
/* Branch guarding call is probably taken. */
DEF_PREDICTOR (PRED_CALL, "call", HITRATE (70), 0)

toplev.c

@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#include "flags.h"
#include "insn-attr.h"
#include "insn-config.h"
#include "insn-flags.h"
#include "hard-reg-set.h"
#include "recog.h"
#include "output.h"
@ -544,6 +545,10 @@ int flag_unroll_loops;
int flag_unroll_all_loops;
/* Nonzero enables prefetch optimizations for arrays in loops. */
int flag_prefetch_loop_arrays;
/* Nonzero forces all invariant computations in loops to be moved
outside the loop. */
@ -1001,6 +1006,8 @@ lang_independent_options f_options[] =
N_("Perform loop unrolling when iteration count is known") },
{"unroll-all-loops", &flag_unroll_all_loops, 1,
N_("Perform loop unrolling for all loops") },
{"prefetch-loop-arrays", &flag_prefetch_loop_arrays, 1,
N_("Generate prefetch instructions, if available, for arrays in loops") },
{"move-all-movables", &flag_move_all_movables, 1,
N_("Force all loop invariant computations out of loops") },
{"reduce-all-givs", &flag_reduce_all_givs, 1,
@ -2863,7 +2870,8 @@ rest_of_compilation (decl)
}
cleanup_barriers ();
loop_optimize (insns, rtl_dump_file,
(flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT);
(flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT
| (flag_prefetch_loop_arrays ? LOOP_PREFETCH : 0));
close_dump_file (DFI_loop, print_rtl, insns);
timevar_pop (TV_LOOP);
@ -4928,6 +4936,20 @@ process_options ()
flag_function_sections = 0;
}
#ifndef HAVE_prefetch
if (flag_prefetch_loop_arrays)
{
warning ("-fprefetch-loop-arrays not supported for this target");
flag_prefetch_loop_arrays = 0;
}
#else
if (flag_prefetch_loop_arrays && !HAVE_prefetch)
{
warning ("-fprefetch-loop-arrays not supported for this target (try -march switches)");
flag_prefetch_loop_arrays = 0;
}
#endif
#ifndef OBJECT_FORMAT_ELF
if (flag_function_sections && write_symbols != NO_DEBUG)
warning ("-ffunction-sections may affect debugging on some targets");