predict.c (estimate_probability): Reorganize opcode heuristics.
* predict.c (estimate_probability): Reorganize opcode heuristics.
* predict.def (PRED_OPCODE_POSITIVE, PRED_OPCODE_NONEQUAL,
  PRED_FPOPCODE): New.

* i386.c (override_options): Recognize various CPU variants and set
  SSE/MMX/3dNOW flags accordingly.
* i386.h (MASK_MMX_SET, MASK_SSE_SET, MASK_SSE2_SET, MASK_3DNOW_SET,
  MASK_3DNOW_A_SET): New.
  (MASK_ACCUMULATE_OUTGOING_ARGS_SET): New.
  (MASK_NO_ACCUMULATE_OUTGOING_ARGS): Delete.
  (MASK_*): Renumber.
  (TARGET_FLAGS): Use new masks.
  (CPP_CPU_SPECS): Recognize new CPU variants.
* invoke.texi (-mcpu): Update documentation.

* flags.h (flag_prefetch_loop_arrays): Declare.
* loop.h (LOOP_PREFETCH): Define new constant.
* loop.c (strength_reduce): Call emit_prefetch_instructions.
  (MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
  PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
  constants.
  (check_store_data): New structure.
  (check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
  New functions.
* toplev.c: Include insn-flags.h.
  (flag_prefetch_loop_arrays): New global variable.
  (lang_independent_option): Add -fprefetch-loop-arrays.
  (rest_of_compilation): Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
  is set.
* Makefile.in (toplev.c): Depend on insn-flags.h.
* invoke.texi (-fprefetch-loop-arrays): Document.

* predict.c (estimate_probability): Distribute the loop exit
  probability according to number of exit edges.

* cfgcleanup.c (insns_match_p): Break out from ...;
  (flow_find_cross_jump): ... here;
  (outgoing_edges_match): Add parameter MODE; attempt to match everything
  except for tablejumps.
  (try_crossjump_to_edge): Accept complex edges.
  (try_crossjump_bb): Likewise.

From-SVN: r47969
commit 0dd0e980b5
parent 85230e5255
gcc/ChangeLog
@@ -1,3 +1,47 @@
Thu Dec 13 12:31:07 CET 2001 Jan Hubicka <jh@suse.cz>

        * predict.c (estimate_probability): Reorganize opcode heuristics.
        * predict.def (PRED_OPCODE_POSITIVE, PRED_OPCODE_NONEQUAL,
        PRED_FPOPCODE): New.

        * i386.c (override_options): Recognize various CPU variants and set
        SSE/MMX/3dNOW flags accordingly.
        * i386.h (MASK_MMX_SET, MASK_SSE_SET, MASK_SSE2_SET, MASK_3DNOW_SET,
        MASK_3DNOW_A_SET): New.
        (MASK_ACCUMULATE_OUTGOING_ARGS_SET): New.
        (MASK_NO_ACCUMULATE_OUTGOING_ARGS): Delete.
        (MASK_*): Renumber.
        (TARGET_FLAGS): Use new masks.
        (CPP_CPU_SPECS): Recognize new CPU variants.
        * invoke.texi (-mcpu): Update documentation.

        * flags.h (flag_prefetch_loop_arrays): Declare.
        * loop.h (LOOP_PREFETCH): Define new constant.
        * loop.c (strength_reduce): Call emit_prefetch_instructions.
        (MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
        PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
        constants.
        (check_store_data): New structure.
        (check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
        New functions.
        * toplev.c: Include insn-flags.h.
        (flag_prefetch_loop_arrays): New global variable.
        (lang_independent_option): Add -fprefetch-loop-arrays.
        (rest_of_compilation) Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
        is set.
        * Makefile.in (toplev.c): Depend on insn-flags.h.
        * invoke.texi (-fprefetch-loop-arrays): Document.

        * predict.c (estimate_probability): Distribute the loop exit
        probability according to number of exit edges.

        * cfgcleanup.c (insns_match_p): Break out from ...;
        (flow_find_cross_jump): ... here;
        (outgoing_edges_match): Add parameter MODE; attempt to match everything
        except for tablejumps.
        (try_crossjump_to_edge): Accept complex edges.
        (try_crossjump_bb): Likewise.

2001-11-29 Corey Minyard <minyard@acm.org>

        * recog.c (validate_replace_rtx_1): Use simplify_gen_binary
gcc/Makefile.in
@@ -1321,7 +1321,7 @@ toplev.o : toplev.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(RTL_H) function.h \
   dwarf2out.h sdbout.h dbxout.h $(EXPR_H) hard-reg-set.h $(BASIC_BLOCK_H) \
   graph.h $(LOOP_H) except.h $(REGS_H) $(TIMEVAR_H) $(lang_options_files) \
   ssa.h $(PARAMS_H) $(TM_P_H) reload.h dwarf2asm.h $(TARGET_H) halfpic.h \
   langhooks.h
   langhooks.h insn-flags.h
        $(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
          -DTARGET_NAME=\"$(target_alias)\" \
          -c $(srcdir)/toplev.c $(OUTPUT_OPTION)
gcc/cfgcleanup.c | 284
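For orientation: cross-jumping merges identical instruction sequences that end different predecessor blocks of a common destination.  A minimal C sketch of the kind of source it targets -- purely illustrative, not part of this patch, with hypothetical function names:

    /* Both arms of the "if" end in the same call sequence, so the common
       tail is a candidate for being merged ("cross-jumped") into one copy.  */
    extern void do_a (int), do_b (int), cleanup (int);

    void
    example (int cond, int x)
    {
      if (cond)
        {
          do_a (x);
          cleanup (x);      /* identical tail ...              */
        }
      else
        {
          do_b (x);
          cleanup (x);      /* ... shared after cross-jumping. */
        }
    }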
@ -64,9 +64,11 @@ enum bb_flags {
|
||||
|
||||
static bool try_crossjump_to_edge PARAMS ((int, edge, edge));
|
||||
static bool try_crossjump_bb PARAMS ((int, basic_block));
|
||||
static bool outgoing_edges_match PARAMS ((basic_block, basic_block));
|
||||
static bool outgoing_edges_match PARAMS ((int,
|
||||
basic_block, basic_block));
|
||||
static int flow_find_cross_jump PARAMS ((int, basic_block, basic_block,
|
||||
rtx *, rtx *));
|
||||
static bool insns_match_p PARAMS ((int, rtx, rtx));
|
||||
|
||||
static bool delete_unreachable_blocks PARAMS ((void));
|
||||
static bool label_is_jump_target_p PARAMS ((rtx, rtx));
|
||||
@ -546,6 +548,108 @@ merge_blocks (e, b, c, mode)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* Return true if I1 and I2 are equivalent and thus can be crossjumped. */
|
||||
|
||||
static bool
|
||||
insns_match_p (mode, i1, i2)
|
||||
int mode;
|
||||
rtx i1, i2;
|
||||
{
|
||||
rtx p1, p2;
|
||||
|
||||
/* Verify that I1 and I2 are equivalent. */
|
||||
if (GET_CODE (i1) != GET_CODE (i2))
|
||||
return false;
|
||||
|
||||
p1 = PATTERN (i1);
|
||||
p2 = PATTERN (i2);
|
||||
|
||||
if (GET_CODE (p1) != GET_CODE (p2))
|
||||
return false;
|
||||
|
||||
/* If this is a CALL_INSN, compare register usage information.
|
||||
If we don't check this on stack register machines, the two
|
||||
CALL_INSNs might be merged leaving reg-stack.c with mismatching
|
||||
numbers of stack registers in the same basic block.
|
||||
If we don't check this on machines with delay slots, a delay slot may
|
||||
be filled that clobbers a parameter expected by the subroutine.
|
||||
|
||||
??? We take the simple route for now and assume that if they're
|
||||
equal, they were constructed identically. */
|
||||
|
||||
if (GET_CODE (i1) == CALL_INSN
|
||||
&& !rtx_equal_p (CALL_INSN_FUNCTION_USAGE (i1),
|
||||
CALL_INSN_FUNCTION_USAGE (i2)))
|
||||
return false;
|
||||
|
||||
#ifdef STACK_REGS
|
||||
/* If cross_jump_death_matters is not 0, the insn's mode
|
||||
indicates whether or not the insn contains any stack-like
|
||||
regs. */
|
||||
|
||||
if ((mode & CLEANUP_POST_REGSTACK) && stack_regs_mentioned (i1))
|
||||
{
|
||||
/* If register stack conversion has already been done, then
|
||||
death notes must also be compared before it is certain that
|
||||
the two instruction streams match. */
|
||||
|
||||
rtx note;
|
||||
HARD_REG_SET i1_regset, i2_regset;
|
||||
|
||||
CLEAR_HARD_REG_SET (i1_regset);
|
||||
CLEAR_HARD_REG_SET (i2_regset);
|
||||
|
||||
for (note = REG_NOTES (i1); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i1_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
for (note = REG_NOTES (i2); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
GO_IF_HARD_REG_EQUAL (i1_regset, i2_regset, done);
|
||||
|
||||
return false;
|
||||
|
||||
done:
|
||||
;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (reload_completed
|
||||
? ! rtx_renumbered_equal_p (p1, p2) : ! rtx_equal_p (p1, p2))
|
||||
{
|
||||
/* The following code helps take care of G++ cleanups. */
|
||||
rtx equiv1 = find_reg_equal_equiv_note (i1);
|
||||
rtx equiv2 = find_reg_equal_equiv_note (i2);
|
||||
|
||||
if (equiv1 && equiv2
|
||||
/* If the equivalences are not to a constant, they may
|
||||
reference pseudos that no longer exist, so we can't
|
||||
use them. */
|
||||
&& (! reload_completed
|
||||
|| (CONSTANT_P (XEXP (equiv1, 0))
|
||||
&& rtx_equal_p (XEXP (equiv1, 0), XEXP (equiv2, 0)))))
|
||||
{
|
||||
rtx s1 = single_set (i1);
|
||||
rtx s2 = single_set (i2);
|
||||
if (s1 != 0 && s2 != 0
|
||||
&& rtx_renumbered_equal_p (SET_DEST (s1), SET_DEST (s2)))
|
||||
{
|
||||
validate_change (i1, &SET_SRC (s1), XEXP (equiv1, 0), 1);
|
||||
validate_change (i2, &SET_SRC (s2), XEXP (equiv2, 0), 1);
|
||||
if (! rtx_renumbered_equal_p (p1, p2))
|
||||
cancel_changes (0);
|
||||
else if (apply_change_group ())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Look through the insns at the end of BB1 and BB2 and find the longest
|
||||
sequence that are equivalent. Store the first insns for that sequence
|
||||
in *F1 and *F2 and return the sequence length.
|
||||
@ -559,7 +663,7 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
|
||||
basic_block bb1, bb2;
|
||||
rtx *f1, *f2;
|
||||
{
|
||||
rtx i1, i2, p1, p2, last1, last2, afterlast1, afterlast2;
|
||||
rtx i1, i2, last1, last2, afterlast1, afterlast2;
|
||||
int ninsns = 0;
|
||||
|
||||
/* Skip simple jumps at the end of the blocks. Complex jumps still
|
||||
@ -586,100 +690,11 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
|
||||
if (i1 == bb1->head || i2 == bb2->head)
|
||||
break;
|
||||
|
||||
/* Verify that I1 and I2 are equivalent. */
|
||||
|
||||
if (GET_CODE (i1) != GET_CODE (i2))
|
||||
if (!insns_match_p (mode, i1, i2))
|
||||
break;
|
||||
|
||||
p1 = PATTERN (i1);
|
||||
p2 = PATTERN (i2);
|
||||
|
||||
/* If this is a CALL_INSN, compare register usage information.
|
||||
If we don't check this on stack register machines, the two
|
||||
CALL_INSNs might be merged leaving reg-stack.c with mismatching
|
||||
numbers of stack registers in the same basic block.
|
||||
If we don't check this on machines with delay slots, a delay slot may
|
||||
be filled that clobbers a parameter expected by the subroutine.
|
||||
|
||||
??? We take the simple route for now and assume that if they're
|
||||
equal, they were constructed identically. */
|
||||
|
||||
if (GET_CODE (i1) == CALL_INSN
|
||||
&& ! rtx_equal_p (CALL_INSN_FUNCTION_USAGE (i1),
|
||||
CALL_INSN_FUNCTION_USAGE (i2)))
|
||||
break;
|
||||
|
||||
#ifdef STACK_REGS
|
||||
/* If cross_jump_death_matters is not 0, the insn's mode
|
||||
indicates whether or not the insn contains any stack-like
|
||||
regs. */
|
||||
|
||||
if ((mode & CLEANUP_POST_REGSTACK) && stack_regs_mentioned (i1))
|
||||
{
|
||||
/* If register stack conversion has already been done, then
|
||||
death notes must also be compared before it is certain that
|
||||
the two instruction streams match. */
|
||||
|
||||
rtx note;
|
||||
HARD_REG_SET i1_regset, i2_regset;
|
||||
|
||||
CLEAR_HARD_REG_SET (i1_regset);
|
||||
CLEAR_HARD_REG_SET (i2_regset);
|
||||
|
||||
for (note = REG_NOTES (i1); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD
|
||||
&& STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i1_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
for (note = REG_NOTES (i2); note; note = XEXP (note, 1))
|
||||
if (REG_NOTE_KIND (note) == REG_DEAD
|
||||
&& STACK_REG_P (XEXP (note, 0)))
|
||||
SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0)));
|
||||
|
||||
GO_IF_HARD_REG_EQUAL (i1_regset, i2_regset, done);
|
||||
|
||||
break;
|
||||
|
||||
done:
|
||||
;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (GET_CODE (p1) != GET_CODE (p2))
|
||||
break;
|
||||
|
||||
if (! rtx_renumbered_equal_p (p1, p2))
|
||||
{
|
||||
/* The following code helps take care of G++ cleanups. */
|
||||
rtx equiv1 = find_reg_equal_equiv_note (i1);
|
||||
rtx equiv2 = find_reg_equal_equiv_note (i2);
|
||||
|
||||
if (equiv1 && equiv2
|
||||
/* If the equivalences are not to a constant, they may
|
||||
reference pseudos that no longer exist, so we can't
|
||||
use them. */
|
||||
&& CONSTANT_P (XEXP (equiv1, 0))
|
||||
&& rtx_equal_p (XEXP (equiv1, 0), XEXP (equiv2, 0)))
|
||||
{
|
||||
rtx s1 = single_set (i1);
|
||||
rtx s2 = single_set (i2);
|
||||
if (s1 != 0 && s2 != 0
|
||||
&& rtx_renumbered_equal_p (SET_DEST (s1), SET_DEST (s2)))
|
||||
{
|
||||
validate_change (i1, &SET_SRC (s1), XEXP (equiv1, 0), 1);
|
||||
validate_change (i2, &SET_SRC (s2), XEXP (equiv2, 0), 1);
|
||||
if (! rtx_renumbered_equal_p (p1, p2))
|
||||
cancel_changes (0);
|
||||
else if (apply_change_group ())
|
||||
goto win;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
win:
|
||||
/* Don't begin a cross-jump with a USE or CLOBBER insn. */
|
||||
if (GET_CODE (p1) != USE && GET_CODE (p1) != CLOBBER)
|
||||
if (active_insn_p (i1))
|
||||
{
|
||||
/* If the merged insns have different REG_EQUAL notes, then
|
||||
remove them. */
|
||||
@ -743,10 +758,15 @@ flow_find_cross_jump (mode, bb1, bb2, f1, f2)
|
||||
We may assume that there exists one edge with a common destination. */
|
||||
|
||||
static bool
|
||||
outgoing_edges_match (bb1, bb2)
|
||||
outgoing_edges_match (mode, bb1, bb2)
|
||||
int mode;
|
||||
basic_block bb1;
|
||||
basic_block bb2;
|
||||
{
|
||||
int nehedges1 = 0, nehedges2 = 0;
|
||||
edge fallthru1 = 0, fallthru2 = 0;
|
||||
edge e1, e2;
|
||||
|
||||
/* If BB1 has only one successor, we must be looking at an unconditional
|
||||
jump. Which, by the assumption above, means that we only need to check
|
||||
that BB2 has one successor. */
|
||||
@ -862,10 +882,60 @@ outgoing_edges_match (bb1, bb2)
|
||||
return match;
|
||||
}
|
||||
|
||||
/* ??? We can handle computed jumps too. This may be important for
|
||||
inlined functions containing switch statements. Also jumps w/o
|
||||
fallthru edges can be handled by simply matching whole insn. */
|
||||
return false;
|
||||
/* Generic case - we are seeing an computed jump, table jump or trapping
|
||||
instruction. */
|
||||
|
||||
/* First ensure that the instructions match. There may be many outgoing
|
||||
edges so this test is generally cheaper.
|
||||
??? Currently the tablejumps will never match, as they do have
|
||||
different tables. */
|
||||
if (!insns_match_p (mode, bb1->end, bb2->end))
|
||||
return false;
|
||||
|
||||
/* Search the outgoing edges, ensure that the counts do match, find possible
|
||||
fallthru and exception handling edges since these needs more
|
||||
validation. */
|
||||
for (e1 = bb1->succ, e2 = bb2->succ; e1 && e2;
|
||||
e1 = e1->succ_next, e2 = e2->succ_next)
|
||||
{
|
||||
if (e1->flags & EDGE_EH)
|
||||
nehedges1++;
|
||||
if (e2->flags & EDGE_EH)
|
||||
nehedges2++;
|
||||
if (e1->flags & EDGE_FALLTHRU)
|
||||
fallthru1 = e1;
|
||||
if (e2->flags & EDGE_FALLTHRU)
|
||||
fallthru2 = e2;
|
||||
}
|
||||
/* If number of edges of various types does not match, fail. */
|
||||
if (e1 || e2)
|
||||
return false;
|
||||
if (nehedges1 != nehedges2)
|
||||
return false;
|
||||
if ((fallthru1 != 0) != (fallthru2 != 0))
|
||||
return false;
|
||||
|
||||
/* fallthru edges must be forwarded to the same destination. */
|
||||
if (fallthru1)
|
||||
{
|
||||
basic_block d1 = (forwarder_block_p (fallthru1->dest)
|
||||
? fallthru1->dest->succ->dest: fallthru1->dest);
|
||||
basic_block d2 = (forwarder_block_p (fallthru2->dest)
|
||||
? fallthru2->dest->succ->dest: fallthru2->dest);
|
||||
if (d1 != d2)
|
||||
return false;
|
||||
}
|
||||
/* In case we do have EH edges, ensure we are in the same region. */
|
||||
if (nehedges1)
|
||||
{
|
||||
rtx n1 = find_reg_note (bb1->end, REG_EH_REGION, 0);
|
||||
rtx n2 = find_reg_note (bb2->end, REG_EH_REGION, 0);
|
||||
if (XEXP (n1, 0) != XEXP (n2, 0))
|
||||
return false;
|
||||
}
|
||||
/* We don't need to match the rest of edges as above checks should be enought
|
||||
to ensure that they are equivalent. */
|
||||
return true;
|
||||
}
|
||||
|
||||
/* E1 and E2 are edges with the same destination block. Search their
|
||||
@ -924,14 +994,8 @@ try_crossjump_to_edge (mode, e1, e2)
|
||||
if (!src1->pred || !src2->pred)
|
||||
return false;
|
||||
|
||||
/* Likewise with complex edges.
|
||||
??? We should be able to handle most complex edges later with some
|
||||
care. */
|
||||
if (e1->flags & EDGE_COMPLEX)
|
||||
return false;
|
||||
|
||||
/* Look for the common insn sequence, part the first ... */
|
||||
if (!outgoing_edges_match (src1, src2))
|
||||
if (!outgoing_edges_match (mode, src1, src2))
|
||||
return false;
|
||||
|
||||
/* ... and part the second. */
|
||||
@ -1066,11 +1130,6 @@ try_crossjump_bb (mode, bb)
|
||||
{
|
||||
nexte = e->pred_next;
|
||||
|
||||
/* Elide complex edges now, as neither try_crossjump_to_edge
|
||||
nor outgoing_edges_match can handle them. */
|
||||
if (e->flags & EDGE_COMPLEX)
|
||||
continue;
|
||||
|
||||
/* As noted above, first try with the fallthru predecessor. */
|
||||
if (fallthru)
|
||||
{
|
||||
@ -1113,11 +1172,6 @@ try_crossjump_bb (mode, bb)
|
||||
if (e2 == fallthru)
|
||||
continue;
|
||||
|
||||
/* Again, neither try_crossjump_to_edge nor outgoing_edges_match
|
||||
can handle complex edges. */
|
||||
if (e2->flags & EDGE_COMPLEX)
|
||||
continue;
|
||||
|
||||
/* The "first successor" check above only prevents multiple
|
||||
checks of crossjump(A,B). In order to prevent redundant
|
||||
checks of crossjump(B,A), require that A be the block
|
||||
|
@ -817,18 +817,42 @@ override_options ()
|
||||
{
|
||||
const char *const name; /* processor name or nickname. */
|
||||
const enum processor_type processor;
|
||||
const enum pta_flags
|
||||
{
|
||||
PTA_SSE = 1,
|
||||
PTA_SSE2 = 2,
|
||||
PTA_MMX = 4,
|
||||
PTA_SSEPREFETCH = 8,
|
||||
PTA_3DNOW = 16,
|
||||
PTA_3DNOW_A = 64
|
||||
} flags;
|
||||
}
|
||||
const processor_alias_table[] =
|
||||
{
|
||||
{"i386", PROCESSOR_I386},
|
||||
{"i486", PROCESSOR_I486},
|
||||
{"i586", PROCESSOR_PENTIUM},
|
||||
{"pentium", PROCESSOR_PENTIUM},
|
||||
{"i686", PROCESSOR_PENTIUMPRO},
|
||||
{"pentiumpro", PROCESSOR_PENTIUMPRO},
|
||||
{"k6", PROCESSOR_K6},
|
||||
{"athlon", PROCESSOR_ATHLON},
|
||||
{"pentium4", PROCESSOR_PENTIUM4},
|
||||
{"i386", PROCESSOR_I386, 0},
|
||||
{"i486", PROCESSOR_I486, 0},
|
||||
{"i586", PROCESSOR_PENTIUM, 0},
|
||||
{"pentium", PROCESSOR_PENTIUM, 0},
|
||||
{"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
|
||||
{"i686", PROCESSOR_PENTIUMPRO, 0},
|
||||
{"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
|
||||
{"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
|
||||
{"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSEPREFETCH},
|
||||
{"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2 |
|
||||
PTA_MMX | PTA_SSEPREFETCH},
|
||||
{"k6", PROCESSOR_K6, PTA_MMX},
|
||||
{"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
|
||||
{"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
|
||||
{"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A},
|
||||
{"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH
|
||||
| PTA_3DNOW | PTA_3DNOW_A},
|
||||
{"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A | PTA_SSE},
|
||||
{"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A | PTA_SSE},
|
||||
{"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_SSEPREFETCH | PTA_3DNOW
|
||||
| PTA_3DNOW_A | PTA_SSE},
|
||||
};
|
||||
|
||||
int const pta_size = sizeof (processor_alias_table) / sizeof (struct pta);
|
||||
@ -880,6 +904,21 @@ override_options ()
|
||||
ix86_arch = processor_alias_table[i].processor;
|
||||
/* Default cpu tuning to the architecture. */
|
||||
ix86_cpu = ix86_arch;
|
||||
if (processor_alias_table[i].flags & PTA_MMX
|
||||
&& !(target_flags & MASK_MMX_SET))
|
||||
target_flags |= MASK_MMX;
|
||||
if (processor_alias_table[i].flags & PTA_3DNOW
|
||||
&& !(target_flags & MASK_3DNOW_SET))
|
||||
target_flags |= MASK_3DNOW;
|
||||
if (processor_alias_table[i].flags & PTA_3DNOW_A
|
||||
&& !(target_flags & MASK_3DNOW_A_SET))
|
||||
target_flags |= MASK_3DNOW_A;
|
||||
if (processor_alias_table[i].flags & PTA_SSE
|
||||
&& !(target_flags & MASK_SSE_SET))
|
||||
target_flags |= MASK_SSE;
|
||||
if (processor_alias_table[i].flags & PTA_SSE2
|
||||
&& !(target_flags & MASK_SSE2_SET))
|
||||
target_flags |= MASK_SSE2;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1045,7 +1084,7 @@ override_options ()
|
||||
target_flags |= MASK_3DNOW_A;
|
||||
}
|
||||
if ((x86_accumulate_outgoing_args & CPUMASK)
|
||||
&& !(target_flags & MASK_NO_ACCUMULATE_OUTGOING_ARGS)
|
||||
&& !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS_SET)
|
||||
&& !optimize_size)
|
||||
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
|
||||
|
||||
|
@ -112,25 +112,30 @@ extern int target_flags;
|
||||
#define MASK_NO_FANCY_MATH_387 0x00000040 /* Disable sin, cos, sqrt */
|
||||
#define MASK_OMIT_LEAF_FRAME_POINTER 0x080 /* omit leaf frame pointers */
|
||||
#define MASK_STACK_PROBE 0x00000100 /* Enable stack probing */
|
||||
#define MASK_NO_ALIGN_STROPS 0x00001000 /* Enable aligning of string ops. */
|
||||
#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */
|
||||
#define MASK_NO_PUSH_ARGS 0x00004000 /* Use push instructions */
|
||||
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00008000/* Accumulate outgoing args */
|
||||
#define MASK_NO_ACCUMULATE_OUTGOING_ARGS 0x00010000
|
||||
#define MASK_MMX 0x00020000 /* Support MMX regs/builtins */
|
||||
#define MASK_SSE 0x00040000 /* Support SSE regs/builtins */
|
||||
#define MASK_SSE2 0x00080000 /* Support SSE2 regs/builtins */
|
||||
#define MASK_NO_ALIGN_STROPS 0x00000200 /* Enable aligning of string ops. */
|
||||
#define MASK_INLINE_ALL_STROPS 0x00000400 /* Inline stringops in all cases */
|
||||
#define MASK_NO_PUSH_ARGS 0x00000800 /* Use push instructions */
|
||||
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00001000/* Accumulate outgoing args */
|
||||
#define MASK_ACCUMULATE_OUTGOING_ARGS_SET 0x00002000
|
||||
#define MASK_MMX 0x00004000 /* Support MMX regs/builtins */
|
||||
#define MASK_MMX_SET 0x00008000
|
||||
#define MASK_SSE 0x00010000 /* Support SSE regs/builtins */
|
||||
#define MASK_SSE_SET 0x00020000
|
||||
#define MASK_SSE2 0x00040000 /* Support SSE2 regs/builtins */
|
||||
#define MASK_SSE2_SET 0x00080000
|
||||
#define MASK_3DNOW 0x00100000 /* Support 3Dnow builtins */
|
||||
#define MASK_3DNOW_A 0x00200000 /* Support Athlon 3Dnow builtins */
|
||||
#define MASK_128BIT_LONG_DOUBLE 0x00400000 /* long double size is 128bit */
|
||||
#define MASK_MIX_SSE_I387 0x00800000 /* Mix SSE and i387 instructions */
|
||||
#define MASK_64BIT 0x01000000 /* Produce 64bit code */
|
||||
#define MASK_NO_RED_ZONE 0x02000000 /* Do not use red zone */
|
||||
#define MASK_3DNOW_SET 0x00200000
|
||||
#define MASK_3DNOW_A 0x00400000 /* Support Athlon 3Dnow builtins */
|
||||
#define MASK_3DNOW_A_SET 0x00800000
|
||||
#define MASK_128BIT_LONG_DOUBLE 0x01000000 /* long double size is 128bit */
|
||||
#define MASK_MIX_SSE_I387 0x02000000 /* Mix SSE and i387 instructions */
|
||||
#define MASK_64BIT 0x04000000 /* Produce 64bit code */
|
||||
#define MASK_NO_RED_ZONE 0x08000000 /* Do not use red zone */
|
||||
|
||||
/* Temporary codegen switches */
|
||||
#define MASK_INTEL_SYNTAX 0x00000200
|
||||
#define MASK_DEBUG_ARG 0x00000400 /* function_arg */
|
||||
#define MASK_DEBUG_ADDR 0x00000800 /* GO_IF_LEGITIMATE_ADDRESS */
|
||||
#define MASK_INTEL_SYNTAX 0x10000000
|
||||
#define MASK_DEBUG_ARG 0x20000000 /* function_arg */
|
||||
#define MASK_DEBUG_ADDR 0x40000000 /* GO_IF_LEGITIMATE_ADDRESS */
|
||||
|
||||
/* Use the floating point instructions */
|
||||
#define TARGET_80387 (target_flags & MASK_80387)
|
||||
@ -335,24 +340,30 @@ extern const int x86_epilogue_using_move, x86_decompose_lea;
|
||||
N_("Use push instructions to save outgoing arguments") }, \
|
||||
{ "no-push-args", MASK_NO_PUSH_ARGS, \
|
||||
N_("Do not use push instructions to save outgoing arguments") }, \
|
||||
{ "accumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS, \
|
||||
{ "accumulate-outgoing-args", (MASK_ACCUMULATE_OUTGOING_ARGS \
|
||||
| MASK_ACCUMULATE_OUTGOING_ARGS_SET), \
|
||||
N_("Use push instructions to save outgoing arguments") }, \
|
||||
{ "no-accumulate-outgoing-args",MASK_NO_ACCUMULATE_OUTGOING_ARGS, \
|
||||
{ "no-accumulate-outgoing-args",MASK_ACCUMULATE_OUTGOING_ARGS_SET, \
|
||||
N_("Do not use push instructions to save outgoing arguments") }, \
|
||||
{ "mmx", MASK_MMX, N_("Support MMX builtins") }, \
|
||||
{ "no-mmx", -MASK_MMX, \
|
||||
{ "mmx", MASK_MMX | MASK_MMX_SET, \
|
||||
N_("Support MMX builtins") }, \
|
||||
{ "no-mmx", -MASK_MMX, \
|
||||
N_("Do not support MMX builtins") }, \
|
||||
{ "3dnow", MASK_3DNOW, \
|
||||
{ "no-mmx", MASK_MMX_SET, N_("") }, \
|
||||
{ "3dnow", MASK_3DNOW | MASK_3DNOW_SET, \
|
||||
N_("Support 3DNow! builtins") }, \
|
||||
{ "no-3dnow", -MASK_3DNOW, \
|
||||
{ "no-3dnow", -MASK_3DNOW, N_("") }, \
|
||||
{ "no-3dnow", MASK_3DNOW_SET, \
|
||||
N_("Do not support 3DNow! builtins") }, \
|
||||
{ "sse", MASK_SSE, \
|
||||
{ "sse", MASK_SSE | MASK_SSE_SET, \
|
||||
N_("Support MMX and SSE builtins and code generation") }, \
|
||||
{ "no-sse", -MASK_SSE, \
|
||||
{ "no-sse", -MASK_SSE, N_("") }, \
|
||||
{ "no-sse", MASK_SSE_SET, \
|
||||
N_("Do not support MMX and SSE builtins and code generation") }, \
|
||||
{ "sse2", MASK_SSE2, \
|
||||
{ "sse2", MASK_SSE2 | MASK_SSE2_SET, \
|
||||
N_("Support MMX, SSE and SSE2 builtins and code generation") }, \
|
||||
{ "no-sse2", -MASK_SSE2, \
|
||||
{ "no-sse2", -MASK_SSE2, N_("") }, \
|
||||
{ "no-sse2", MASK_SSE2_SET, \
|
||||
N_("Do not support MMX, SSE and SSE2 builtins and code generation") }, \
|
||||
{ "mix-sse-i387", MASK_MIX_SSE_I387, \
|
||||
N_("Use both SSE and i387 instruction sets for floating point arithmetics") },\
|
||||
@ -522,11 +533,22 @@ extern int ix86_arch;
|
||||
%{march=pentium4:-D__pentium4 -D__pentium4__ %{!mcpu*:-D__tune_pentium4__ }}\
|
||||
%{m386|mcpu=i386:-D__tune_i386__ }\
|
||||
%{m486|mcpu=i486:-D__tune_i486__ }\
|
||||
%{mpentium|mcpu=pentium|mcpu=i586:-D__tune_i586__ -D__tune_pentium__ }\
|
||||
%{mpentiumpro|mcpu=pentiumpro|mcpu=i686:-D__tune_i686__ -D__tune_pentiumpro__ }\
|
||||
%{mcpu=k6:-D__tune_k6__ }\
|
||||
%{mcpu=athlon:-D__tune_athlon__ }\
|
||||
%{mpentium|mcpu=pentium|mcpu=i586|mcpu=pentium-mmx:-D__tune_i586__ -D__tune_pentium__ }\
|
||||
%{mpentiumpro|mcpu=pentiumpro|mcpu=i686|cpu=pentium2|cpu=pentium3:-D__tune_i686__\
|
||||
-D__tune_pentiumpro__ }\
|
||||
%{mcpu=k6|mcpu=k6-2|mcpu=k6-3:-D__tune_k6__ }\
|
||||
%{mcpu=athlon|mcpu=athlon-tbird|mcpu=athlon-4|mcpu=athlon-xp|mcpu=athlon-mp:\
|
||||
-D__tune_athlon__ }\
|
||||
%{mcpu=pentium4:-D__tune_pentium4__ }\
|
||||
%{march=march=athlon-tbird|march=athlon-xp|march=athlon-mp|march=pentium3|march=pentium4:\
|
||||
-D__SSE__ }\
|
||||
%{march=pentium-mmx|march=k6|march=k6-2|march=k6-3\
|
||||
march=athlon|march=athlon-tbird|march=athlon-4|march=athlon-xp\
|
||||
|march=athlon-mp|march=pentium2|march=pentium3|march=pentium4: -D__MMX__ }\
|
||||
%{march=k6|march=k6-2|march=k6-3\
|
||||
march=athlon|march=athlon-tbird|march=athlon-4|march=athlon-xp\
|
||||
|march=athlon-mp: -D__3dNOW__ }\
|
||||
%{mcpu=mcpu=pentium4: -D__SSE2__ }\
|
||||
%{!march*:%{!mcpu*:%{!m386:%{!m486:%{!mpentium*:%(cpp_cpu_default)}}}}}"
|
||||
|
||||
#ifndef CPP_CPU_SPEC
|
||||
|
gcc/invoke.texi
@@ -272,7 +272,7 @@ in the following sections.
-fno-inline -fno-math-errno -fno-peephole -fno-peephole2 @gol
-funsafe-math-optimizations -fno-trapping-math @gol
-fomit-frame-pointer -foptimize-register-move @gol
-foptimize-sibling-calls -freduce-all-givs @gol
-foptimize-sibling-calls -fprefetch-loop-arrays -freduce-all-givs @gol
-fregmove -frename-registers @gol
-frerun-cse-after-loop -frerun-loop-opt @gol
-fschedule-insns -fschedule-insns2 @gol
@@ -3570,6 +3570,10 @@ the loop is entered. This usually makes programs run more slowly.
@option{-funroll-all-loops} implies the same options as
@option{-funroll-loops},

@item -fprefetch-loop-arrays
@opindex fprefetch-loop-arrays
If supported by the target machine, generate instructions to prefetch
memory to improve the performance of loops that access large arrays.
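As a concrete sketch of the kind of loop the new option is aimed at -- illustrative only, the function name and types are made up, not taken from the patch -- a dense, constant-stride traversal of a large array is the pattern the pass recognizes from the loop's induction variables:

    /* Illustrative example only.  Compiled with the new option enabled
       (e.g. -O2 -fprefetch-loop-arrays on a target with a prefetch insn),
       this loop is a candidate for the inserted prefetches.  */
    double
    sum_array (const double *a, long n)
    {
      double s = 0.0;
      long i;

      for (i = 0; i < n; i++)   /* constant stride, known from the BIV/GIV */
        s += a[i];
      return s;
    }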
@item -fmove-all-movables
@opindex fmove-all-movables
@@ -7476,10 +7480,13 @@ computers:
@table @gcctabopt
@item -mcpu=@var{cpu-type}
@opindex mcpu
Assume the defaults for the machine type @var{cpu-type} when scheduling
instructions. The choices for @var{cpu-type} are @samp{i386},
@samp{i486}, @samp{i586}, @samp{i686}, @samp{pentium},
@samp{pentiumpro}, @samp{pentium4}, @samp{k6}, and @samp{athlon}
Tune to @var{cpu-type} everything applicable about the generated code, except
for the ABI and the set of available instructions. The choices for
@var{cpu-type} are @samp{i386}, @samp{i486}, @samp{i586}, @samp{i686},
@samp{pentium}, @samp{pentium-mmx}, @samp{pentiumpro}, @samp{pentium2},
@samp{pentium3}, @samp{pentium4}, @samp{k6}, @samp{k6-2}, @samp{k6-3},
@samp{athlon}, @samp{athlon-tbird}, @samp{athlon-4}, @samp{athlon-xp}
and @samp{athlon-mp}.

While picking a specific @var{cpu-type} will schedule things appropriately
for that particular chip, the compiler will not generate any code that
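Alongside the tuning macros, the updated CPP_CPU_SPECS in i386.h also predefines feature-test macros (__MMX__, __SSE__, __SSE2__, __3dNOW__) for the corresponding new -march/-mcpu values.  A minimal, purely illustrative sketch of source keyed off those macros (the function is hypothetical, not part of GCC):

    /* Illustrative only: pick a code path based on the macros the driver
       now predefines for the new -march/-mcpu variants.  */
    const char *
    isa_in_use (void)
    {
    #if defined (__SSE2__)
      return "sse2";
    #elif defined (__SSE__)
      return "sse";
    #elif defined (__3dNOW__)
      return "3dnow";
    #elif defined (__MMX__)
      return "mmx";
    #else
      return "baseline x86";
    #endif
    }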
gcc/flags.h
@@ -269,6 +269,10 @@ extern int flag_unroll_all_loops;

extern int flag_move_all_movables;

/* Nonzero enables prefetch optimizations for arrays in loops. */

extern int flag_prefetch_loop_arrays;

/* Nonzero forces all general induction variables in loops to be
   strength reduced. */
gcc/loop.c | 593
@ -53,6 +53,90 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
|
||||
#include "except.h"
|
||||
#include "toplev.h"
|
||||
#include "predict.h"
|
||||
#include "insn-flags.h"
|
||||
|
||||
/* Not really meaningful values, but at least something. */
|
||||
#ifndef SIMULTANEOUS_PREFETCHES
|
||||
#define SIMULTANEOUS_PREFETCHES 3
|
||||
#endif
|
||||
#ifndef PREFETCH_BLOCK
|
||||
#define PREFETCH_BLOCK 32
|
||||
#endif
|
||||
#ifndef HAVE_prefetch
|
||||
#define HAVE_prefetch 0
|
||||
#define gen_prefetch(a,b,c) (abort(), NULL_RTX)
|
||||
#endif
|
||||
|
||||
/* Give up the prefetch optimizations once we exceed a given threshhold.
|
||||
It is unlikely that we would be able to optimize something in a loop
|
||||
with so many detected prefetches. */
|
||||
#define MAX_PREFETCHES 100
|
||||
/* The number of prefetch blocks that are beneficial to fetch at once before
|
||||
a loop with a known (and low) iteration count. */
|
||||
#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 6
|
||||
/* For very tiny loops it is not worthwhile to prefetch even before the loop,
|
||||
since it is likely that the data are already in the cache. */
|
||||
#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN 2
|
||||
/* The minimal number of prefetch blocks that a loop must consume to make
|
||||
the emitting of prefetch instruction in the body of loop worthwhile. */
|
||||
#define PREFETCH_BLOCKS_IN_LOOP_MIN 6
|
||||
|
||||
/* Parameterize some prefetch heuristics so they can be turned on and off
|
||||
easily for performance testing on new architecures. These can be
|
||||
defined in target-dependent files. */
|
||||
|
||||
/* Prefetch is worthwhile only when loads/stores are dense. */
|
||||
#ifndef PREFETCH_ONLY_DENSE_MEM
|
||||
#define PREFETCH_ONLY_DENSE_MEM 1
|
||||
#endif
|
||||
|
||||
/* Define what we mean by "dense" loads and stores; This value divided by 256
|
||||
is the minimum percentage of memory references that worth prefetching. */
|
||||
#ifndef PREFETCH_DENSE_MEM
|
||||
#define PREFETCH_DENSE_MEM 220
|
||||
#endif
|
||||
|
||||
/* Do not prefetch for a loop whose iteration count is known to be low. */
|
||||
#ifndef PREFETCH_NO_LOW_LOOPCNT
|
||||
#define PREFETCH_NO_LOW_LOOPCNT 1
|
||||
#endif
|
||||
|
||||
/* Define what we mean by a "low" iteration count. */
|
||||
#ifndef PREFETCH_LOW_LOOPCNT
|
||||
#define PREFETCH_LOW_LOOPCNT 32
|
||||
#endif
|
||||
|
||||
/* Do not prefetch for a loop that contains a function call; such a loop is
|
||||
probably not an internal loop. */
|
||||
#ifndef PREFETCH_NO_CALL
|
||||
#define PREFETCH_NO_CALL 1
|
||||
#endif
|
||||
|
||||
/* Do not prefetch accesses with an extreme stride. */
|
||||
#ifndef PREFETCH_NO_EXTREME_STRIDE
|
||||
#define PREFETCH_NO_EXTREME_STRIDE 1
|
||||
#endif
|
||||
|
||||
/* Define what we mean by an "extreme" stride. */
|
||||
#ifndef PREFETCH_EXTREME_STRIDE
|
||||
#define PREFETCH_EXTREME_STRIDE 4096
|
||||
#endif
|
||||
|
||||
/* Do not handle reversed order prefetches (negative stride). */
|
||||
#ifndef PREFETCH_NO_REVERSE_ORDER
|
||||
#define PREFETCH_NO_REVERSE_ORDER 1
|
||||
#endif
|
||||
|
||||
/* Prefetch even if the GIV is not always executed. */
|
||||
#ifndef PREFETCH_NOT_ALWAYS
|
||||
#define PREFETCH_NOT_ALWAYS 0
|
||||
#endif
|
||||
|
||||
/* If the loop requires more prefetches than the target can process in
|
||||
parallel then don't prefetch anything in that loop. */
|
||||
#ifndef PREFETCH_LIMIT_TO_SIMULTANEOUS
|
||||
#define PREFETCH_LIMIT_TO_SIMULTANEOUS 1
|
||||
#endif
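These heuristics are deliberately parameterized so that a port can override them, as the comment above notes.  A hypothetical target-header override, with made-up values purely for illustration (not taken from any real target):

    /* Illustrative overrides only -- the numbers are invented.  */
    #define PREFETCH_BLOCK 64               /* 64-byte cache lines.           */
    #define SIMULTANEOUS_PREFETCHES 8       /* deeper prefetch queue.         */
    #define PREFETCH_NOT_ALWAYS 1           /* prefetch conditional GIVs too. */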
|
||||
|
||||
#define LOOP_REG_LIFETIME(LOOP, REGNO) \
|
||||
((REGNO_LAST_LUID (REGNO) - REGNO_FIRST_LUID (REGNO)))
|
||||
@ -262,6 +346,7 @@ static rtx loop_insn_sink_or_swim PARAMS((const struct loop *, rtx));
|
||||
|
||||
static void loop_dump_aux PARAMS ((const struct loop *, FILE *, int));
|
||||
static void loop_delete_insns PARAMS ((rtx, rtx));
|
||||
static int remove_constant_addition PARAMS ((rtx *));
|
||||
void debug_ivs PARAMS ((const struct loop *));
|
||||
void debug_iv_class PARAMS ((const struct iv_class *));
|
||||
void debug_biv PARAMS ((const struct induction *));
|
||||
@ -3412,6 +3497,509 @@ loop_reg_used_before_p (loop, set, insn)
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Information we collect about arrays that we might want to prefetch. */
|
||||
struct prefetch_info
|
||||
{
|
||||
struct iv_class *class; /* Class this prefetch is based on. */
|
||||
struct induction *giv; /* GIV this prefetch is based on. */
|
||||
rtx base_address; /* Start prefetching from this address plus
|
||||
index. */
|
||||
HOST_WIDE_INT index;
|
||||
HOST_WIDE_INT stride; /* Prefetch stride in bytes in each
|
||||
iteration. */
|
||||
unsigned int bytes_accesed; /* Sum of sizes of all acceses to this
|
||||
prefetch area in one iteration. */
|
||||
unsigned int total_bytes; /* Total bytes loop will access in this block.
|
||||
This is set only for loops with known
|
||||
iteration counts and is 0xffffffff
|
||||
otherwise. */
|
||||
unsigned int write : 1; /* 1 for read/write prefetches. */
|
||||
unsigned int prefetch_in_loop : 1;
|
||||
/* 1 for those chosen for prefetching. */
|
||||
unsigned int prefetch_before_loop : 1;
|
||||
/* 1 for those chosen for prefetching. */
|
||||
};
|
||||
|
||||
/* Data used by check_store function. */
|
||||
struct check_store_data
|
||||
{
|
||||
rtx mem_address;
|
||||
int mem_write;
|
||||
};
|
||||
|
||||
static void check_store PARAMS ((rtx, rtx, void *));
|
||||
static void emit_prefetch_instructions PARAMS ((struct loop *));
|
||||
static int rtx_equal_for_prefetch_p PARAMS ((rtx, rtx));
|
||||
|
||||
/* Set mem_write when mem_address is found. Used as callback to
|
||||
note_stores. */
|
||||
static void
|
||||
check_store (x, pat, data)
|
||||
rtx x, pat ATTRIBUTE_UNUSED;
|
||||
void *data;
|
||||
{
|
||||
struct check_store_data *d = (struct check_store_data *)data;
|
||||
|
||||
if ((GET_CODE (x) == MEM) && rtx_equal_p (d->mem_address, XEXP (x, 0)))
|
||||
d->mem_write = 1;
|
||||
}
|
||||
|
||||
/* Like rtx_equal_p, but attempts to swap commutative operands. This is
|
||||
important to get some addresses combined. Later more sophisticated
|
||||
transformations can be added when necesary.
|
||||
|
||||
??? Same trick with swapping operand is done at several other places.
|
||||
It can be nice to develop some common way to handle this. */
|
||||
|
||||
static int
|
||||
rtx_equal_for_prefetch_p (x, y)
|
||||
rtx x, y;
|
||||
{
|
||||
int i;
|
||||
int j;
|
||||
enum rtx_code code = GET_CODE (x);
|
||||
const char *fmt;
|
||||
|
||||
if (x == y)
|
||||
return 1;
|
||||
if (code != GET_CODE (y))
|
||||
return 0;
|
||||
|
||||
code = GET_CODE (x);
|
||||
|
||||
if (GET_RTX_CLASS (code) == 'c')
|
||||
{
|
||||
return ((rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 0))
|
||||
&& rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 1)))
|
||||
|| (rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 1))
|
||||
&& rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 0))));
|
||||
}
|
||||
/* Compare the elements. If any pair of corresponding elements fails to
|
||||
match, return 0 for the whole thing. */
|
||||
|
||||
fmt = GET_RTX_FORMAT (code);
|
||||
for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
|
||||
{
|
||||
switch (fmt[i])
|
||||
{
|
||||
case 'w':
|
||||
if (XWINT (x, i) != XWINT (y, i))
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'i':
|
||||
if (XINT (x, i) != XINT (y, i))
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'E':
|
||||
/* Two vectors must have the same length. */
|
||||
if (XVECLEN (x, i) != XVECLEN (y, i))
|
||||
return 0;
|
||||
|
||||
/* And the corresponding elements must match. */
|
||||
for (j = 0; j < XVECLEN (x, i); j++)
|
||||
if (rtx_equal_for_prefetch_p (XVECEXP (x, i, j),
|
||||
XVECEXP (y, i, j)) == 0)
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'e':
|
||||
if (rtx_equal_for_prefetch_p (XEXP (x, i), XEXP (y, i)) == 0)
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 's':
|
||||
if (strcmp (XSTR (x, i), XSTR (y, i)))
|
||||
return 0;
|
||||
break;
|
||||
|
||||
case 'u':
|
||||
/* These are just backpointers, so they don't matter. */
|
||||
break;
|
||||
|
||||
case '0':
|
||||
break;
|
||||
|
||||
/* It is believed that rtx's at this level will never
|
||||
contain anything but integers and other rtx's,
|
||||
except for within LABEL_REFs and SYMBOL_REFs. */
|
||||
default:
|
||||
abort ();
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Remove constant addition value from the expression X (when present)
|
||||
and return it. */
|
||||
static HOST_WIDE_INT
|
||||
remove_constant_addition (x)
|
||||
rtx *x;
|
||||
{
|
||||
HOST_WIDE_INT addval = 0;
|
||||
rtx exp=*x;
|
||||
|
||||
if (GET_CODE (exp) == CONST)
|
||||
exp = XEXP (exp, 0);
|
||||
if (GET_CODE (exp) == CONST_INT)
|
||||
{
|
||||
addval = INTVAL (exp);
|
||||
*x = const0_rtx;
|
||||
}
|
||||
/* For plus expression recurse on ourself. */
|
||||
else if (GET_CODE (exp) == PLUS)
|
||||
{
|
||||
addval += remove_constant_addition (&XEXP (exp, 0));
|
||||
addval += remove_constant_addition (&XEXP (exp, 1));
|
||||
/* In case our parameter was constant, remove extra zero
|
||||
from the expression. */
|
||||
if (XEXP (exp, 0) == const0_rtx)
|
||||
*x = XEXP (exp, 1);
|
||||
else if (XEXP (exp, 1) == const0_rtx)
|
||||
*x = XEXP (exp, 0);
|
||||
}
|
||||
return addval;
|
||||
}
|
||||
|
||||
/* Attempt to identify accesses to arrays that are most likely to cause cache
|
||||
misses, and emit prefetch instructions a few prefetch blocks forward.
|
||||
|
||||
To detect the arrays we use the GIV information that was collected by the
|
||||
strength reduction pass.
|
||||
|
||||
The prefetch instructions are generated after the GIV information is done
|
||||
and before the strength reduction process. The new GIVs are injected into
|
||||
the strength reduction tables, so the prefetch addresses are optimized as
|
||||
well.
|
||||
|
||||
GIVs are split into base address, stride, and constant addition values.
|
||||
GIVs with the same address, stride and close addition values are combined
|
||||
into a single prefetch. Also writes to GIVs are detected, so that prefetch
|
||||
for write instructions can be used for the block we write to, on machines
|
||||
that support write prefetches.
|
||||
|
||||
Several heuristics are used to determine when to prefetch. They are
|
||||
controlled by defined symbols that can be overridden for each target.
|
||||
*/
|
||||
static void
|
||||
emit_prefetch_instructions (struct loop *loop)
|
||||
{
|
||||
int num_prefetches = 0;
|
||||
int num_real_prefetches = 0;
|
||||
int num_real_write_prefetches = 0;
|
||||
int ahead;
|
||||
int i;
|
||||
struct iv_class *bl;
|
||||
struct induction *iv;
|
||||
struct prefetch_info info[MAX_PREFETCHES];
|
||||
struct loop_ivs *ivs = LOOP_IVS (loop);
|
||||
|
||||
if (!HAVE_prefetch)
|
||||
return;
|
||||
|
||||
/* Consider only loops w/o calls. When a call is done, the loop is probably
|
||||
slow enough to read the memory. */
|
||||
if (PREFETCH_NO_CALL && LOOP_INFO (loop)->has_call)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
fprintf (loop_dump_stream, "Prefetch: ignoring loop - has call.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (PREFETCH_NO_LOW_LOOPCNT
|
||||
&& LOOP_INFO (loop)->n_iterations
|
||||
&& LOOP_INFO (loop)->n_iterations <= PREFETCH_LOW_LOOPCNT)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
fprintf (loop_dump_stream,
|
||||
"Prefetch: ignoring loop - not enought iterations.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Search all induction variables and pick those interesting for the prefetch
|
||||
machinery. */
|
||||
for (bl = ivs->list; bl; bl = bl->next)
|
||||
{
|
||||
struct induction *biv = bl->biv, *biv1;
|
||||
int basestride = 0;
|
||||
|
||||
biv1 = biv;
|
||||
/* Expect all BIVs to be executed in each iteration. This makes our
|
||||
analysis more conservative. */
|
||||
while (biv1)
|
||||
{
|
||||
/* Discard non-constant additions that we can't handle well yet, and
|
||||
BIVs that are executed multiple times; such BIVs ought to be
|
||||
handled in the nested loop. We accept not_every_iteration BIVs,
|
||||
since these only result in larger strides and make our
|
||||
heuristics more conservative.
|
||||
??? What does the last sentence mean? */
|
||||
|
||||
if (GET_CODE (biv->add_val) != CONST_INT)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch: biv %i ignored: non-constant addition at insn %i:",
|
||||
REGNO (biv->src_reg), INSN_UID (biv->insn));
|
||||
print_rtl (loop_dump_stream, biv->add_val);
|
||||
fprintf (loop_dump_stream, "\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (biv->maybe_multiple)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch: biv %i ignored: maybe_multiple at insn %i:",
|
||||
REGNO (biv->src_reg), INSN_UID (biv->insn));
|
||||
print_rtl (loop_dump_stream, biv->add_val);
|
||||
fprintf (loop_dump_stream, "\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
basestride += INTVAL (biv1->add_val);
|
||||
biv1 = biv1->next_iv;
|
||||
}
|
||||
if (biv1 || !basestride)
|
||||
continue;
|
||||
for (iv = bl->giv; iv; iv = iv->next_iv)
|
||||
{
|
||||
rtx address;
|
||||
rtx temp;
|
||||
HOST_WIDE_INT index = 0;
|
||||
int add = 1;
|
||||
HOST_WIDE_INT stride;
|
||||
struct check_store_data d;
|
||||
int size = GET_MODE_SIZE (GET_MODE (iv));
|
||||
|
||||
/* There are several reasons why an induction variable is not
|
||||
interesting to us. */
|
||||
if (iv->giv_type != DEST_ADDR
|
||||
/* We are interested only in constant stride memory references
|
||||
in order to be able to compute density easily. */
|
||||
|| GET_CODE (iv->mult_val) != CONST_INT
|
||||
/* Don't handle reversed order prefetches, since they are usually
|
||||
ineffective. Later we may be able to reverse such BIVs. */
|
||||
|| (PREFETCH_NO_REVERSE_ORDER
|
||||
&& (stride = INTVAL (iv->mult_val) * basestride) < 0)
|
||||
/* Prefetching of accesses with such a extreme stride is probably
|
||||
not worthwhile, either. */
|
||||
|| (PREFETCH_NO_EXTREME_STRIDE
|
||||
&& stride > PREFETCH_EXTREME_STRIDE)
|
||||
/* Ignore GIVs with varying add values; we can't predict the value
|
||||
for the next iteration. */
|
||||
|| !loop_invariant_p (loop, iv->add_val)
|
||||
/* Ignore GIVs in the nested loops; they ought to have been handled
|
||||
already. */
|
||||
|| iv->maybe_multiple)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch: Ignoring giv at %i\n",
|
||||
INSN_UID (iv->insn));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Determine the pointer to the basic array we are examining. It is
|
||||
the sum of the BIV's initial value and the GIV's add_val. */
|
||||
index = 0;
|
||||
|
||||
address = copy_rtx (iv->add_val);
|
||||
temp = copy_rtx (bl->initial_value);
|
||||
|
||||
address = simplify_gen_binary (PLUS, Pmode, temp, address);
|
||||
index = remove_constant_addition (&address);
|
||||
|
||||
index += size;
|
||||
d.mem_write = 0;
|
||||
d.mem_address = *iv->location;
|
||||
/* When the GIV is not always executed, we might be better off by
|
||||
not dirtying the cache pages. */
|
||||
if (PREFETCH_NOT_ALWAYS || iv->always_executed)
|
||||
note_stores (PATTERN (iv->insn), check_store, &d);
|
||||
|
||||
/* Attempt to find another prefetch to the same array and see if we
|
||||
can merge this one. */
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
if (rtx_equal_for_prefetch_p (address, info[i].base_address)
|
||||
&& stride == info[i].stride)
|
||||
{
|
||||
/* In case both access same array (same location
|
||||
just with small difference in constant indexes), merge
|
||||
the prefetches. Just do the later and the earlier will
|
||||
get prefetched from previous iteration.
|
||||
4096 is artificial threshold. It should not be too small,
|
||||
but also not bigger than small portion of memory usually
|
||||
traversed by single loop. */
|
||||
|
||||
if (index >= info[i].index && index - info[i].index < 4096)
|
||||
{
|
||||
info[i].write |= d.mem_write;
|
||||
info[i].bytes_accesed += size;
|
||||
info[i].index = index;
|
||||
info[i].giv = iv;
|
||||
info[i].class = bl;
|
||||
info[num_prefetches].base_address = address;
|
||||
add = 0;
|
||||
break;
|
||||
}
|
||||
if (index < info[i].index && info[i].index - index < 4096)
|
||||
{
|
||||
info[i].write |= d.mem_write;
|
||||
info[i].bytes_accesed += size;
|
||||
add = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Merging failed. */
|
||||
if (add)
|
||||
{
|
||||
info[num_prefetches].giv = iv;
|
||||
info[num_prefetches].class = bl;
|
||||
info[num_prefetches].index = index;
|
||||
info[num_prefetches].stride = stride;
|
||||
info[num_prefetches].base_address = address;
|
||||
info[num_prefetches].write = d.mem_write;
|
||||
info[num_prefetches].bytes_accesed = size;
|
||||
num_prefetches++;
|
||||
if (num_prefetches >= MAX_PREFETCHES)
|
||||
{
|
||||
if (loop_dump_stream)
|
||||
fprintf(loop_dump_stream,"Maximal number of prefetches exceeded.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
{
|
||||
/* Attempt to calculate the number of bytes fetched by the loop.
|
||||
Avoid overflow. */
|
||||
if (LOOP_INFO (loop)->n_iterations
|
||||
&& (0xffffffff / info[i].stride) >= LOOP_INFO (loop)->n_iterations)
|
||||
info[i].total_bytes = info[i].stride * LOOP_INFO (loop)->n_iterations;
|
||||
else
|
||||
info[i].total_bytes = 0xffffffff;
|
||||
|
||||
|
||||
/* Prefetch is worthwhile only when the loads/stores are dense. */
|
||||
if (PREFETCH_ONLY_DENSE_MEM
|
||||
&& (info[i].bytes_accesed * 256 / info[i].stride > PREFETCH_DENSE_MEM)
|
||||
&& (info[i].total_bytes / PREFETCH_BLOCK >=
|
||||
PREFETCH_BLOCKS_BEFORE_LOOP_MIN))
|
||||
{
|
||||
info[i].prefetch_before_loop = 1;
|
||||
if (info[i].total_bytes / PREFETCH_BLOCK <=
|
||||
PREFETCH_BLOCKS_BEFORE_LOOP_MAX)
|
||||
info[i].prefetch_in_loop = 0;
|
||||
else
|
||||
info[i].prefetch_in_loop = 1;
|
||||
}
|
||||
else
|
||||
info[i].prefetch_in_loop = 0, info[i].prefetch_before_loop = 0;
|
||||
|
||||
if (info[i].prefetch_in_loop)
|
||||
{
|
||||
num_real_prefetches += ((info[i].stride + PREFETCH_BLOCK - 1)
|
||||
/ PREFETCH_BLOCK);
|
||||
if (info[i].write)
|
||||
num_real_write_prefetches +=
|
||||
((info[i].stride + PREFETCH_BLOCK - 1) / PREFETCH_BLOCK);
|
||||
}
|
||||
}
|
||||
if (loop_dump_stream)
|
||||
{
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
{
|
||||
fprintf (loop_dump_stream, "Prefetch insn %i address: ",
|
||||
INSN_UID (info[i].giv->insn));
|
||||
print_rtl (loop_dump_stream, info[i].base_address);
|
||||
fprintf (loop_dump_stream, " Index:%i stride:%i density:%i%% total_bytes: %u %s in loop:%s before:%s\n",
|
||||
info[i].index, info[i].stride,
|
||||
info[i].bytes_accesed * 100 / info[i].stride,
|
||||
info[i].total_bytes,
|
||||
info[i].write ? "read/write" : "read only",
|
||||
info[i].prefetch_in_loop ? "yes" : "no",
|
||||
info[i].prefetch_before_loop ? "yes" : "no");
|
||||
}
|
||||
fprintf (loop_dump_stream, "Real prefetches needed:%i (write:%i)\n",
|
||||
num_real_prefetches, num_real_write_prefetches);
|
||||
}
|
||||
|
||||
if (!num_real_prefetches)
|
||||
return;
|
||||
|
||||
ahead = (SIMULTANEOUS_PREFETCHES / (num_real_prefetches));
|
||||
|
||||
if (!ahead)
|
||||
return;
|
||||
for (i = 0; i < num_prefetches; i++)
|
||||
{
|
||||
if (info[i].prefetch_in_loop)
|
||||
{
|
||||
int y;
|
||||
for (y = 0; y < ((info[i].stride + PREFETCH_BLOCK - 1)
|
||||
/ PREFETCH_BLOCK); y++)
|
||||
{
|
||||
rtx loc = copy_rtx (*info[i].giv->location);
|
||||
rtx insn;
|
||||
int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
|
||||
rtx before_insn = info[i].giv->insn;
|
||||
rtx prev_insn = PREV_INSN (info[i].giv->insn);
|
||||
|
||||
/* We can save some effort by offsetting the address on
|
||||
architectures with offsettable memory references. */
|
||||
if (offsettable_address_p (0, VOIDmode, loc))
|
||||
loc = plus_constant (loc, bytes_ahead);
|
||||
else
|
||||
{
|
||||
rtx reg = gen_reg_rtx (Pmode);
|
||||
loop_iv_add_mult_emit_before (loop, loc, const1_rtx,
|
||||
GEN_INT (bytes_ahead), reg,
|
||||
0, before_insn);
|
||||
loc = reg;
|
||||
}
|
||||
|
||||
emit_insn_before (gen_prefetch (loc, GEN_INT (info[i].write),
|
||||
GEN_INT (3)), before_insn);
|
||||
|
||||
/* Check all insns emitted and record the new GIV information. */
|
||||
insn = NEXT_INSN (prev_insn);
|
||||
while (insn != before_insn)
|
||||
{
|
||||
insn = check_insn_for_givs (loop, insn,
|
||||
info[i].giv->always_executed,
|
||||
info[i].giv->maybe_multiple);
|
||||
insn = NEXT_INSN (insn);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (info[i].prefetch_before_loop)
|
||||
{
|
||||
int y;
|
||||
/* Emit INSNs before the loop to fetch the first cache lines. */
|
||||
for (y = 0; ((!info[i].prefetch_in_loop || y < ahead)
|
||||
&& y * PREFETCH_BLOCK < (int)info[i].total_bytes); y ++)
|
||||
{
|
||||
rtx reg = gen_reg_rtx (Pmode);
|
||||
rtx loop_start = loop->start;
|
||||
rtx add_val = simplify_gen_binary (PLUS, Pmode,
|
||||
info[i].giv->add_val,
|
||||
GEN_INT (y * PREFETCH_BLOCK));
|
||||
loop_iv_add_mult_emit_before (loop, info[i].class->initial_value,
|
||||
info[i].giv->mult_val,
|
||||
add_val, reg, 0, loop_start);
|
||||
emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write),
|
||||
GEN_INT (3)), loop_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* A "basic induction variable" or biv is a pseudo reg that is set
|
||||
(within this loop) only by incrementing or decrementing it. */
|
||||
/* A "general induction variable" or giv is a pseudo reg whose
|
||||
@ -4298,6 +4886,11 @@ strength_reduce (loop, flags)
|
||||
fail if the iteration variable is a giv. */
|
||||
loop_iterations (loop);
|
||||
|
||||
#ifdef HAVE_prefetch
|
||||
if (flags & LOOP_PREFETCH)
|
||||
emit_prefetch_instructions (loop);
|
||||
#endif
|
||||
|
||||
/* Now for each giv for which we still don't know whether or not it is
|
||||
replaceable, check to see if it is replaceable because its final value
|
||||
can be calculated. This must be done after loop_iterations is called,
|
||||
|
gcc/loop.h
@@ -27,6 +27,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* Flags passed to loop_optimize. */
#define LOOP_UNROLL 1
#define LOOP_BCT 2
#define LOOP_PREFETCH 4

/* Get the loop info pointer of a loop. */
#define LOOP_INFO(LOOP) ((struct loop_info *) (LOOP)->aux)
gcc/predict.c | 151
@ -329,12 +329,17 @@ estimate_probability (loops_info)
|
||||
for (i = 0; i < loops_info->num; i++)
|
||||
{
|
||||
int j;
|
||||
int exits;
|
||||
struct loop *loop = &loops_info->array[i];
|
||||
|
||||
for (j = loops_info->array[i].first->index;
|
||||
j <= loops_info->array[i].last->index;
|
||||
flow_loop_scan (loops_info, loop, LOOP_EXIT_EDGES);
|
||||
exits = loop->num_exits;
|
||||
|
||||
for (j = loop->first->index;
|
||||
j <= loop->last->index;
|
||||
++j)
|
||||
{
|
||||
if (TEST_BIT (loops_info->array[i].nodes, j))
|
||||
if (TEST_BIT (loop->nodes, j))
|
||||
{
|
||||
int header_found = 0;
|
||||
edge e;
|
||||
@ -342,8 +347,8 @@ estimate_probability (loops_info)
|
||||
/* Loop branch heuristics - predict as taken an edge back to
|
||||
a loop's head. */
|
||||
for (e = BASIC_BLOCK(j)->succ; e; e = e->succ_next)
|
||||
if (e->dest == loops_info->array[i].header
|
||||
&& e->src == loops_info->array[i].latch)
|
||||
if (e->dest == loop->header
|
||||
&& e->src == loop->latch)
|
||||
{
|
||||
header_found = 1;
|
||||
predict_edge_def (e, PRED_LOOP_BRANCH, TAKEN);
|
||||
@ -354,8 +359,11 @@ estimate_probability (loops_info)
|
||||
if (!header_found)
|
||||
for (e = BASIC_BLOCK(j)->succ; e; e = e->succ_next)
|
||||
if (e->dest->index <= 0
|
||||
|| !TEST_BIT (loops_info->array[i].nodes, e->dest->index))
|
||||
predict_edge_def (e, PRED_LOOP_EXIT, NOT_TAKEN);
|
||||
|| !TEST_BIT (loop->nodes, e->dest->index))
|
||||
predict_edge (e, PRED_LOOP_EXIT,
|
||||
(REG_BR_PROB_BASE
|
||||
- predictor_info [(int)PRED_LOOP_EXIT].hitrate)
|
||||
/ exits);
|
||||
}
|
||||
}
|
||||
}
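A worked instance of the new distribution, with an assumed hitrate (REG_BR_PROB_BASE is 10000 in this tree; the 90% figure below is only an example, not necessarily the value in predict.def):

    /* Illustrative arithmetic only.  If PRED_LOOP_EXIT had a 90% hitrate
       and the loop had 3 exit edges, each exit edge would be predicted
       taken with probability
           (REG_BR_PROB_BASE - hitrate) / exits = (10000 - 9000) / 3 = 333,
       i.e. roughly 3.3% per exit, rather than the full 10% being assigned
       to every exit edge as before.  */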
|
||||
@ -435,74 +443,83 @@ estimate_probability (loops_info)
|
||||
/* Try "pointer heuristic."
|
||||
A comparison ptr == 0 is predicted as false.
|
||||
Similarly, a comparison ptr1 == ptr2 is predicted as false. */
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case EQ:
|
||||
if (GET_CODE (XEXP (cond, 0)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 0))
|
||||
&& (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 1)))))
|
||||
|
||||
if (GET_RTX_CLASS (GET_CODE (cond)) == '<'
|
||||
&& ((REG_P (XEXP (cond, 0)) && REG_POINTER (XEXP (cond, 0)))
|
||||
|| (REG_P (XEXP (cond, 1)) && REG_POINTER (XEXP (cond, 1)))))
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case EQ:
|
||||
predict_insn_def (last_insn, PRED_POINTER, NOT_TAKEN);
|
||||
break;
|
||||
case NE:
|
||||
if (GET_CODE (XEXP (cond, 0)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 0))
|
||||
&& (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == REG
|
||||
&& REG_POINTER (XEXP (cond, 1)))))
|
||||
break;
|
||||
case NE:
|
||||
predict_insn_def (last_insn, PRED_POINTER, TAKEN);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
else
|
||||
/* Try "opcode heuristic."
|
||||
EQ tests are usually false and NE tests are usually true. Also,
|
||||
most quantities are positive, so we can make the appropriate guesses
|
||||
about signed comparisons against zero. */
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case CONST_INT:
|
||||
/* Unconditional branch. */
|
||||
predict_insn_def (last_insn, PRED_UNCONDITIONAL,
|
||||
cond == const0_rtx ? NOT_TAKEN : TAKEN);
|
||||
break;
|
||||
switch (GET_CODE (cond))
|
||||
{
|
||||
case CONST_INT:
|
||||
/* Unconditional branch. */
|
||||
predict_insn_def (last_insn, PRED_UNCONDITIONAL,
|
||||
cond == const0_rtx ? NOT_TAKEN : TAKEN);
|
||||
break;
|
||||
|
||||
case EQ:
|
||||
case UNEQ:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case NE:
|
||||
case LTGT:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
|
||||
break;
|
||||
case ORDERED:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
|
||||
break;
|
||||
case UNORDERED:
|
||||
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case LE:
|
||||
case LT:
|
||||
if (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == CONST_INT
|
||||
&& INTVAL (XEXP (cond, 1)) == -1))
|
||||
predict_insn_def (last_insn, PRED_OPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case GE:
|
||||
case GT:
|
||||
if (XEXP (cond, 1) == const0_rtx
|
||||
|| (GET_CODE (XEXP (cond, 1)) == CONST_INT
|
||||
&& INTVAL (XEXP (cond, 1)) == -1))
|
||||
predict_insn_def (last_insn, PRED_OPCODE, TAKEN);
|
||||
break;
|
||||
case EQ:
|
||||
case UNEQ:
|
||||
/* Floating point comparisons appears to behave in a very
|
||||
inpredictable way because of special role of = tests in
|
||||
FP code. */
|
||||
if (FLOAT_MODE_P (GET_MODE (XEXP (cond, 0))))
|
||||
;
|
||||
/* Comparisons with 0 are often used for booleans and there is
|
||||
nothing usefull to predict about them. */
|
||||
else if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 0) == const0_rtx)
|
||||
;
|
||||
else
|
||||
predict_insn_def (last_insn, PRED_OPCODE_NONEQUAL, NOT_TAKEN);
|
||||
break;
|
||||
case NE:
|
||||
case LTGT:
|
||||
/* Floating point comparisons appears to behave in a very
|
||||
inpredictable way because of special role of = tests in
|
||||
FP code. */
|
||||
if (FLOAT_MODE_P (GET_MODE (XEXP (cond, 0))))
|
||||
;
|
||||
/* Comparisons with 0 are often used for booleans and there is
|
||||
nothing usefull to predict about them. */
|
||||
else if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 0) == const0_rtx)
|
||||
;
|
||||
else
|
||||
predict_insn_def (last_insn, PRED_OPCODE_NONEQUAL, TAKEN);
|
||||
break;
|
||||
case ORDERED:
|
||||
predict_insn_def (last_insn, PRED_FPOPCODE, TAKEN);
|
||||
break;
|
||||
case UNORDERED:
|
||||
predict_insn_def (last_insn, PRED_FPOPCODE, NOT_TAKEN);
|
||||
break;
|
||||
case LE:
|
||||
case LT:
|
||||
if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 1) == const1_rtx
|
||||
|| XEXP (cond, 1) == constm1_rtx)
|
||||
predict_insn_def (last_insn, PRED_OPCODE_POSITIVE, NOT_TAKEN);
|
||||
break;
|
||||
case GE:
|
||||
case GT:
|
||||
if (XEXP (cond, 1) == const0_rtx || XEXP (cond, 1) == const1_rtx
|
||||
|| XEXP (cond, 1) == constm1_rtx)
|
||||
predict_insn_def (last_insn, PRED_OPCODE_POSITIVE, TAKEN);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Attach the combined probability to each conditional jump. */
|
||||
|
gcc/predict.def
@@ -89,7 +89,9 @@ DEF_PREDICTOR (PRED_LOOP_HEADER, "loop header", HITRATE (64), 0)
DEF_PREDICTOR (PRED_POINTER, "pointer", HITRATE (83), 0)

/* NE is probable, EQ not etc... */
DEF_PREDICTOR (PRED_OPCODE, "opcode", HITRATE (55), 0)
DEF_PREDICTOR (PRED_OPCODE_POSITIVE, "opcode values positive", HITRATE (78), 0)
DEF_PREDICTOR (PRED_OPCODE_NONEQUAL, "opcode values nonequal", HITRATE (70), 0)
DEF_PREDICTOR (PRED_FPOPCODE, "fp_opcode", HITRATE (90), 0)

/* Branch guarding call is probably taken. */
DEF_PREDICTOR (PRED_CALL, "call", HITRATE (70), 0)
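To make the new predictors concrete, an illustrative C fragment (hypothetical names; the mapping assumes the comparisons reach RTL in the obvious form) showing branches the reorganized opcode heuristics act on:

    extern void f1 (void), f2 (void), f3 (void);

    void
    example (int a, int b, int n, double x, double y)
    {
      if (a != b)   /* integer non-equality: PRED_OPCODE_NONEQUAL, predicted taken.     */
        f1 ();
      if (n > 0)    /* signed compare against zero: PRED_OPCODE_POSITIVE, predicted taken. */
        f2 ();
      if (x == y)   /* floating-point equality: deliberately left unpredicted by the
                       opcode heuristics, since FP == tests behave unpredictably.       */
        f3 ();
    }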
gcc/toplev.c | 24
@@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#include "flags.h"
#include "insn-attr.h"
#include "insn-config.h"
#include "insn-flags.h"
#include "hard-reg-set.h"
#include "recog.h"
#include "output.h"
@@ -544,6 +545,10 @@ int flag_unroll_loops;

int flag_unroll_all_loops;

/* Nonzero enables prefetch optimizations for arrays in loops. */

int flag_prefetch_loop_arrays;

/* Nonzero forces all invariant computations in loops to be moved
   outside the loop. */

@@ -1001,6 +1006,8 @@ lang_independent_options f_options[] =
   N_("Perform loop unrolling when iteration count is known") },
  {"unroll-all-loops", &flag_unroll_all_loops, 1,
   N_("Perform loop unrolling for all loops") },
  {"prefetch-loop-arrays", &flag_prefetch_loop_arrays, 1,
   N_("Generate prefetch instructions, if available, for arrays in loops") },
  {"move-all-movables", &flag_move_all_movables, 1,
   N_("Force all loop invariant computations out of loops") },
  {"reduce-all-givs", &flag_reduce_all_givs, 1,
@@ -2863,7 +2870,8 @@ rest_of_compilation (decl)
    }
  cleanup_barriers ();
  loop_optimize (insns, rtl_dump_file,
                 (flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT);
                 (flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT
                 | (flag_prefetch_loop_arrays ? LOOP_PREFETCH : 0));

  close_dump_file (DFI_loop, print_rtl, insns);
  timevar_pop (TV_LOOP);
@@ -4928,6 +4936,20 @@ process_options ()
      flag_function_sections = 0;
    }

#ifndef HAVE_prefetch
  if (flag_prefetch_loop_arrays)
    {
      warning ("-fprefetch-loop-arrays not supported for this target");
      flag_prefetch_loop_arrays = 0;
    }
#else
  if (flag_prefetch_loop_arrays && !HAVE_prefetch)
    {
      warning ("-fprefetch-loop-arrays not supported for this target (try -march switches)");
      flag_prefetch_loop_arrays = 0;
    }
#endif

#ifndef OBJECT_FORMAT_ELF
  if (flag_function_sections && write_symbols != NO_DEBUG)
    warning ("-ffunction-sections may affect debugging on some targets");