diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f3a3b0b3bf8..24493252e2c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,48 @@ +2008-08-30 Jan Hubicka + + * optabs.c (expand_abs_nojump): Update BRANCH_COST call. + * fold-const.c (LOGICAL_OP_NON_SHORT_CIRCUIT, fold_truthop): Likewise. + * dojump.c (do_jump): Likewise. + * ifcvt.c (MAX_CONDITIONAL_EXECUTE): Likewise. + (noce_if_info): Add branch_cost. + (noce_try_store_flag_constants, noce_try_addcc, noce_try_store_flag_mask, + noce_try_cmove_arith, + noce_find_if_block, find_if_case_1, find_if_case_2): Use computed + branch cost. + * expr.h (BRANCH_COST): Update default. + * predict.c (predictable_edge_p): New function. + * expmed.c (expand_smod_pow2, expand_sdiv_pow2, emit_store_flag): + Update BRANCH_COST call. + * basic-block.h (predictable_edge_p): Declare. + * config/alpha/alpha.h (BRANCH_COST): Update. + * config/frv/frv.h (BRANCH_COST): Update. + * config/s390/s390.h (BRANCH_COST): Update. + * config/spu/spu.h (BRANCH_COST): Update. + * config/sparc/sparc.h (BRANCH_COST): Update. + * config/m32r/m32r.h (BRANCH_COST): Update. + * config/i386/i386.h (BRANCH_COST): Update. + * config/i386/i386.c (ix86_expand_int_movcc): Update use of BRANCH_COST. + * config/sh/sh.h (BRANCH_COST): Update. + * config/pdp11/pdp11.h (BRANCH_COST): Update. + * config/avr/avr.h (BRANCH_COST): Update. + * config/crx/crx.h (BRANCH_COST): Update. + * config/xtensa/xtensa.h (BRANCH_COST): Update. + * config/stormy16/stormy16.h (BRANCH_COST): Update. + * config/m68hc11/m68hc11.h (BRANCH_COST): Update. + * config/iq2000/iq2000.h (BRANCH_COST): Update. + * config/ia64/ia64.h (BRANCH_COST): Update. + * config/rs6000/rs6000.h (BRANCH_COST): Update. + * config/arc/arc.h (BRANCH_COST): Update. + * config/score/score.h (BRANCH_COST): Update. + * config/arm/arm.h (BRANCH_COST): Update. + * config/pa/pa.h (BRANCH_COST): Update. + * config/mips/mips.h (BRANCH_COST): Update. 
+ * config/vax/vax.h (BRANCH_COST): Update. + * config/h8300/h8300.h (BRANCH_COST): Update. + * params.def (PARAM_PREDICTABLE_BRANCH_OUTCOME): New. + * doc/invoke.texi (predictable-branch-outcome): Document. + * doc/tm.texi (BRANCH_COST): Update. + 2008-08-30 Samuel Tardieu PR target/37283 diff --git a/gcc/basic-block.h b/gcc/basic-block.h index e891f9e5675..e1d13ea27c6 100644 --- a/gcc/basic-block.h +++ b/gcc/basic-block.h @@ -852,6 +852,7 @@ extern void guess_outgoing_edge_probabilities (basic_block); extern void remove_predictions_associated_with_edge (edge); extern bool edge_probability_reliable_p (const_edge); extern bool br_prob_note_reliable_p (const_rtx); +extern bool predictable_edge_p (edge); /* In cfg.c */ extern void dump_regset (regset, FILE *); diff --git a/gcc/config/alpha/alpha.h b/gcc/config/alpha/alpha.h index 4336e6c9357..8e022d6a5f8 100644 --- a/gcc/config/alpha/alpha.h +++ b/gcc/config/alpha/alpha.h @@ -640,7 +640,7 @@ extern int alpha_memory_latency; #define MEMORY_MOVE_COST(MODE,CLASS,IN) (2*alpha_memory_latency) /* Provide the cost of a branch. Exact meaning under development. */ -#define BRANCH_COST 5 +#define BRANCH_COST(speed_p, predictable_p) 5 /* Stack layout; function entry, exit and calling. */ diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index ea40fb23ab7..ff473c3f73e 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -824,7 +824,7 @@ arc_select_cc_mode (OP, X, Y) /* The cost of a branch insn. */ /* ??? What's the right value here? Branches are certainly more expensive than reg->reg moves. */ -#define BRANCH_COST 2 +#define BRANCH_COST(speed_p, predictable_p) 2 /* Nonzero if access to memory by bytes is slow and undesirable. 
For RISC chips, it means that access to memory by bytes is no diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 2f236e02442..4132b06b024 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -2297,7 +2297,7 @@ do { \ /* Try to generate sequences that don't involve branches, we can then use conditional instructions */ -#define BRANCH_COST \ +#define BRANCH_COST(speed_p, predictable_p) \ (TARGET_32BIT ? 4 : (optimize > 0 ? 2 : 0)) /* Position Independent Code. */ diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h index b5132e26313..74409851805 100644 --- a/gcc/config/avr/avr.h +++ b/gcc/config/avr/avr.h @@ -511,7 +511,7 @@ do { \ (MODE)==SImode ? 8 : \ (MODE)==SFmode ? 8 : 16) -#define BRANCH_COST 0 +#define BRANCH_COST(speed_p, predictable_p) 0 #define SLOW_BYTE_ACCESS 0 diff --git a/gcc/config/crx/crx.h b/gcc/config/crx/crx.h index c3e1e203113..91c5f31d1e1 100644 --- a/gcc/config/crx/crx.h +++ b/gcc/config/crx/crx.h @@ -420,7 +420,7 @@ struct cumulative_args /* Moving to processor register flushes pipeline - thus asymmetric */ #define REGISTER_MOVE_COST(MODE, FROM, TO) ((TO != GENERAL_REGS) ? 8 : 2) /* Assume best case (branch predicted) */ -#define BRANCH_COST 2 +#define BRANCH_COST(speed_p, predictable_p) 2 #define SLOW_BYTE_ACCESS 1 diff --git a/gcc/config/frv/frv.h b/gcc/config/frv/frv.h index 6c86ef569f2..8a71337fcc1 100644 --- a/gcc/config/frv/frv.h +++ b/gcc/config/frv/frv.h @@ -2193,7 +2193,7 @@ do { \ /* A C expression for the cost of a branch instruction. A value of 1 is the default; other values are interpreted relative to that. */ -#define BRANCH_COST frv_branch_cost_int +#define BRANCH_COST(speed_p, predictable_p) frv_branch_cost_int /* Define this macro as a C expression which is nonzero if accessing less than a word of memory (i.e. 
a `char' or a `short') is no faster than accessing a diff --git a/gcc/config/h8300/h8300.h b/gcc/config/h8300/h8300.h index 5ed4205dcd3..7305fc32f85 100644 --- a/gcc/config/h8300/h8300.h +++ b/gcc/config/h8300/h8300.h @@ -1004,7 +1004,7 @@ struct cum_arg #define DELAY_SLOT_LENGTH(JUMP) \ (NEXT_INSN (PREV_INSN (JUMP)) == JUMP ? 0 : 2) -#define BRANCH_COST 0 +#define BRANCH_COST(speed_p, predictable_p) 0 /* Tell final.c how to eliminate redundant test instructions. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9ef95b503ec..002a9a7e2a0 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -14636,7 +14636,8 @@ ix86_expand_int_movcc (rtx operands[]) */ if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - && BRANCH_COST >= 2) + && BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 2) { if (cf == 0) { @@ -14721,7 +14722,7 @@ ix86_expand_int_movcc (rtx operands[]) optab op; rtx var, orig_out, out, tmp; - if (BRANCH_COST <= 2) + if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) return 0; /* FAIL */ /* If one of the two operands is an interesting constant, load a diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 7ad70694242..d933c5e2389 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1975,7 +1975,8 @@ do { \ /* A C expression for the cost of a branch instruction. A value of 1 is the default; other values are interpreted relative to that. */ -#define BRANCH_COST ix86_branch_cost +#define BRANCH_COST(speed_p, predictable_p) \ + (!(speed_p) ? 2 : (predictable_p) ? 0 : ix86_branch_cost) /* Define this macro as a C expression which is nonzero if accessing less than a word of memory (i.e. 
a `char' or a `short') is no diff --git a/gcc/config/ia64/ia64.h b/gcc/config/ia64/ia64.h index 6fca6902408..e97bcf6e934 100644 --- a/gcc/config/ia64/ia64.h +++ b/gcc/config/ia64/ia64.h @@ -1384,7 +1384,7 @@ do { \ many additional insn groups we run into, vs how good the dynamic branch predictor is. */ -#define BRANCH_COST 6 +#define BRANCH_COST(speed_p, predictable_p) 6 /* Define this macro as a C expression which is nonzero if accessing less than a word of memory (i.e. a `char' or a `short') is no faster than accessing a diff --git a/gcc/config/iq2000/iq2000.h b/gcc/config/iq2000/iq2000.h index f09a9f7bf2a..f6c7ec5edb6 100644 --- a/gcc/config/iq2000/iq2000.h +++ b/gcc/config/iq2000/iq2000.h @@ -624,7 +624,7 @@ typedef struct iq2000_args #define MEMORY_MOVE_COST(MODE,CLASS,TO_P) \ (TO_P ? 2 : 16) -#define BRANCH_COST 2 +#define BRANCH_COST(speed_p, predictable_p) 2 #define SLOW_BYTE_ACCESS 1 diff --git a/gcc/config/m32r/m32r.h b/gcc/config/m32r/m32r.h index 33fe7e0666a..f2f7e891201 100644 --- a/gcc/config/m32r/m32r.h +++ b/gcc/config/m32r/m32r.h @@ -1224,7 +1224,7 @@ L2: .word STATIC /* A value of 2 here causes GCC to avoid using branches in comparisons like while (a < N && a). Branches aren't that expensive on the M32R so we define this as 1. Defining it as 2 had a heavy hit in fp-bit.c. */ -#define BRANCH_COST ((TARGET_BRANCH_COST) ? 2 : 1) +#define BRANCH_COST(speed_p, predictable_p) ((TARGET_BRANCH_COST) ? 2 : 1) /* Nonzero if access to memory by bytes is slow and undesirable. For RISC chips, it means that access to memory by bytes is no diff --git a/gcc/config/m68hc11/m68hc11.h b/gcc/config/m68hc11/m68hc11.h index 7034c93b30e..cae57e3a9fe 100644 --- a/gcc/config/m68hc11/m68hc11.h +++ b/gcc/config/m68hc11/m68hc11.h @@ -1266,7 +1266,7 @@ extern unsigned char m68hc11_reg_valid_for_index[FIRST_PSEUDO_REGISTER]; Pretend branches are cheap because GCC generates sub-optimal code for the default value. 
*/ -#define BRANCH_COST 0 +#define BRANCH_COST(speed_p, predictable_p) 0 /* Nonzero if access to memory by bytes is slow and undesirable. */ #define SLOW_BYTE_ACCESS 0 diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 66788c17e6e..e008e804781 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -2557,7 +2557,7 @@ typedef struct mips_args { /* A C expression for the cost of a branch instruction. A value of 1 is the default; other values are interpreted relative to that. */ -#define BRANCH_COST mips_branch_cost +#define BRANCH_COST(speed_p, predictable_p) mips_branch_cost #define LOGICAL_OP_NON_SHORT_CIRCUIT 0 /* If defined, modifies the length assigned to instruction INSN as a diff --git a/gcc/config/pa/pa.h b/gcc/config/pa/pa.h index 610bcf5da3c..5e272a2f346 100644 --- a/gcc/config/pa/pa.h +++ b/gcc/config/pa/pa.h @@ -1570,7 +1570,7 @@ do { \ : 2) /* Adjust the cost of branches. */ -#define BRANCH_COST (pa_cpu == PROCESSOR_8000 ? 2 : 1) +#define BRANCH_COST(speed_p, predictable_p) (pa_cpu == PROCESSOR_8000 ? 2 : 1) /* Handling the special cases is going to get too complicated for a macro, just call `pa_adjust_insn_length' to do the real work. */ diff --git a/gcc/config/pdp11/pdp11.h b/gcc/config/pdp11/pdp11.h index 662886cf8db..e572d6cf9c3 100644 --- a/gcc/config/pdp11/pdp11.h +++ b/gcc/config/pdp11/pdp11.h @@ -1057,7 +1057,7 @@ JMP FUNCTION 0x0058 0x0000 <- FUNCTION /* there is no point in avoiding branches on a pdp, since branches are really cheap - I just want to find out how much difference the BRANCH_COST macro makes in code */ -#define BRANCH_COST (TARGET_BRANCH_CHEAP ? 0 : 1) +#define BRANCH_COST(speed_p, predictable_p) (TARGET_BRANCH_CHEAP ? 
0 : 1) #define COMPARE_FLAG_MODE HImode diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 048d163ff14..2d5bbff39ab 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -967,7 +967,7 @@ extern enum rs6000_nop_insertion rs6000_sched_insert_nops; Set this to 3 on the RS/6000 since that is roughly the average cost of an unscheduled conditional branch. */ -#define BRANCH_COST 3 +#define BRANCH_COST(speed_p, predictable_p) 3 /* Override BRANCH_COST heuristic which empirically produces worse performance for removing short circuiting from the logical ops. */ diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index b96f10026e7..a31efd24a23 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -828,7 +828,7 @@ extern struct rtx_def *s390_compare_op0, *s390_compare_op1, *s390_compare_emitte /* A C expression for the cost of a branch instruction. A value of 1 is the default; other values are interpreted relative to that. */ -#define BRANCH_COST 1 +#define BRANCH_COST(speed_p, predictable_p) 1 /* Nonzero if access to memory by bytes is slow and undesirable. */ #define SLOW_BYTE_ACCESS 1 diff --git a/gcc/config/score/score.h b/gcc/config/score/score.h index d400f6ab0ce..d9900a50214 100644 --- a/gcc/config/score/score.h +++ b/gcc/config/score/score.h @@ -793,7 +793,7 @@ typedef struct score_args (4 + memory_move_secondary_cost ((MODE), (CLASS), (TO_P))) /* Try to generate sequences that don't involve branches. */ -#define BRANCH_COST 2 +#define BRANCH_COST(speed_p, predictable_p) 2 /* Nonzero if access to memory by bytes is slow and undesirable. */ #define SLOW_BYTE_ACCESS 1 diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h index 47ecfc494da..15bc744fd1d 100644 --- a/gcc/config/sh/sh.h +++ b/gcc/config/sh/sh.h @@ -2847,7 +2847,8 @@ struct sh_args { The SH1 does not have delay slots, hence we get a pipeline stall at every branch. 
The SH4 is superscalar, so the single delay slot is not sufficient to keep both pipelines filled. */ -#define BRANCH_COST (TARGET_SH5 ? 1 : ! TARGET_SH2 || TARGET_HARD_SH4 ? 2 : 1) +#define BRANCH_COST(speed_p, predictable_p) \ + (TARGET_SH5 ? 1 : ! TARGET_SH2 || TARGET_HARD_SH4 ? 2 : 1) /* Assembler output control. */ diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index 42894705361..de5f52089a3 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -2196,7 +2196,7 @@ do { \ On Niagara-2, a not-taken branch costs 1 cycle whereas a taken branch costs 6 cycles. */ -#define BRANCH_COST \ +#define BRANCH_COST(speed_p, predictable_p) \ ((sparc_cpu == PROCESSOR_V9 \ || sparc_cpu == PROCESSOR_ULTRASPARC) \ ? 7 \ diff --git a/gcc/config/spu/spu.h b/gcc/config/spu/spu.h index b27e9b7b12e..f78eb73c429 100644 --- a/gcc/config/spu/spu.h +++ b/gcc/config/spu/spu.h @@ -434,7 +434,7 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \ /* Costs */ -#define BRANCH_COST spu_branch_cost +#define BRANCH_COST(speed_p, predictable_p) spu_branch_cost #define SLOW_BYTE_ACCESS 0 diff --git a/gcc/config/stormy16/stormy16.h b/gcc/config/stormy16/stormy16.h index 4cd4084a2bb..005108de349 100644 --- a/gcc/config/stormy16/stormy16.h +++ b/gcc/config/stormy16/stormy16.h @@ -587,7 +587,7 @@ do { \ #define MEMORY_MOVE_COST(M,C,I) (5 + memory_move_secondary_cost (M, C, I)) -#define BRANCH_COST 5 +#define BRANCH_COST(speed_p, predictable_p) 5 #define SLOW_BYTE_ACCESS 0 diff --git a/gcc/config/vax/vax.h b/gcc/config/vax/vax.h index 84a6ba4bd1a..20d781bf27e 100644 --- a/gcc/config/vax/vax.h +++ b/gcc/config/vax/vax.h @@ -648,7 +648,7 @@ enum reg_class { NO_REGS, ALL_REGS, LIM_REG_CLASSES }; Branches are extremely cheap on the VAX while the shift insns often used to replace branches can be expensive. */ -#define BRANCH_COST 0 +#define BRANCH_COST(speed_p, predictable_p) 0 /* Tell final.c how to eliminate redundant test instructions. 
*/ diff --git a/gcc/config/xtensa/xtensa.h b/gcc/config/xtensa/xtensa.h index 1c23b8d568c..fecb56996e3 100644 --- a/gcc/config/xtensa/xtensa.h +++ b/gcc/config/xtensa/xtensa.h @@ -887,7 +887,7 @@ typedef struct xtensa_args #define MEMORY_MOVE_COST(MODE, CLASS, IN) 4 -#define BRANCH_COST 3 +#define BRANCH_COST(speed_p, predictable_p) 3 /* How to refer to registers in assembler output. This sequence is indexed by compiler's hard-register-number (see above). */ diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 64b6a40f6c1..29b67c72b61 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -6900,6 +6900,10 @@ to the hottest structure frequency in the program is less than this parameter, then structure reorganization is not applied to this structure. The default is 10. +@item predictable-branch-outcome +When a branch is predicted to be taken with probability lower than this threshold +(in percent), then it is considered well predictable. The default is 2. + @item max-crossjump-edges The maximum number of incoming edges to consider for crossjumping. The algorithm used by @option{-fcrossjumping} is @math{O(N^2)} in diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 3087694a9cd..2cd0877601d 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5874,9 +5874,14 @@ value to the result of that function. The arguments to that function are the same as to this macro. @end defmac -@defmac BRANCH_COST -A C expression for the cost of a branch instruction. A value of 1 is -the default; other values are interpreted relative to that. +@defmac BRANCH_COST (@var{speed_p}, @var{predictable_p}) +A C expression for the cost of a branch instruction. A value of 1 is the +default; other values are interpreted relative to that. Parameter @var{speed_p} +is true when the branch in question should be optimized for speed. When +it is false, @code{BRANCH_COST} should return a value optimal for code size +rather than performance considerations. 
@var{predictable_p} is true for well +predictable branches. On many architectures the @code{BRANCH_COST} can be +reduced then. @end defmac Here are additional macros which do not specify precise relative costs, diff --git a/gcc/dojump.c b/gcc/dojump.c index 46aa4f2ebb8..bee8da0fd03 100644 --- a/gcc/dojump.c +++ b/gcc/dojump.c @@ -510,7 +510,9 @@ do_jump (tree exp, rtx if_false_label, rtx if_true_label) /* High branch cost, expand as the bitwise AND of the conditions. Do the same if the RHS has side effects, because we're effectively turning a TRUTH_AND_EXPR into a TRUTH_ANDIF_EXPR. */ - if (BRANCH_COST >= 4 || TREE_SIDE_EFFECTS (TREE_OPERAND (exp, 1))) + if (BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 4 + || TREE_SIDE_EFFECTS (TREE_OPERAND (exp, 1))) goto normal; case TRUTH_ANDIF_EXPR: @@ -531,7 +533,8 @@ do_jump (tree exp, rtx if_false_label, rtx if_true_label) /* High branch cost, expand as the bitwise OR of the conditions. Do the same if the RHS has side effects, because we're effectively turning a TRUTH_OR_EXPR into a TRUTH_ORIF_EXPR. */ - if (BRANCH_COST >= 4 || TREE_SIDE_EFFECTS (TREE_OPERAND (exp, 1))) + if (BRANCH_COST (optimize_insn_for_speed_p (), false)>= 4 + || TREE_SIDE_EFFECTS (TREE_OPERAND (exp, 1))) goto normal; case TRUTH_ORIF_EXPR: diff --git a/gcc/expmed.c b/gcc/expmed.c index 8212992d10d..399139b97b5 100644 --- a/gcc/expmed.c +++ b/gcc/expmed.c @@ -3492,7 +3492,7 @@ expand_smod_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d) result = gen_reg_rtx (mode); /* Avoid conditional branches when they're expensive. 
*/ - if (BRANCH_COST >= 2 + if (BRANCH_COST (optimize_insn_for_speed_p (), false) >= 2 && optimize_insn_for_speed_p ()) { rtx signmask = emit_store_flag (result, LT, op0, const0_rtx, @@ -3592,7 +3592,9 @@ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d) logd = floor_log2 (d); shift = build_int_cst (NULL_TREE, logd); - if (d == 2 && BRANCH_COST >= 1) + if (d == 2 + && BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 1) { temp = gen_reg_rtx (mode); temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, 1); @@ -3602,7 +3604,8 @@ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d) } #ifdef HAVE_conditional_move - if (BRANCH_COST >= 2) + if (BRANCH_COST (optimize_insn_for_speed_p (), false) + >= 2) { rtx temp2; @@ -3631,7 +3634,8 @@ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d) } #endif - if (BRANCH_COST >= 2) + if (BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 2) { int ushift = GET_MODE_BITSIZE (mode) - logd; @@ -5345,7 +5349,8 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1, comparison with zero. Don't do any of these cases if branches are very cheap. */ - if (BRANCH_COST > 0 + if (BRANCH_COST (optimize_insn_for_speed_p (), + false) > 0 && GET_MODE_CLASS (mode) == MODE_INT && (code == EQ || code == NE) && op1 != const0_rtx) { @@ -5368,10 +5373,12 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1, do LE and GT if branches are expensive since they are expensive on 2-operand machines. */ - if (BRANCH_COST == 0 + if (BRANCH_COST (optimize_insn_for_speed_p (), + false) == 0 || GET_MODE_CLASS (mode) != MODE_INT || op1 != const0_rtx || (code != EQ && code != NE - && (BRANCH_COST <= 1 || (code != LE && code != GT)))) + && (BRANCH_COST (optimize_insn_for_speed_p (), + false) <= 1 || (code != LE && code != GT)))) return 0; /* See what we need to return. 
We can only return a 1, -1, or the @@ -5467,7 +5474,10 @@ emit_store_flag (rtx target, enum rtx_code code, rtx op0, rtx op1, that "or", which is an extra insn, so we only handle EQ if branches are expensive. */ - if (tem == 0 && (code == NE || BRANCH_COST > 1)) + if (tem == 0 + && (code == NE + || BRANCH_COST (optimize_insn_for_speed_p (), + false) > 1)) { if (rtx_equal_p (subtarget, op0)) subtarget = 0; diff --git a/gcc/expr.h b/gcc/expr.h index 29b1f3e9312..30dbaef1037 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -36,7 +36,7 @@ along with GCC; see the file COPYING3. If not see /* The default branch cost is 1. */ #ifndef BRANCH_COST -#define BRANCH_COST 1 +#define BRANCH_COST(speed_p, predictable_p) 1 #endif /* This is the 4th arg to `expand_expr'. diff --git a/gcc/fold-const.c b/gcc/fold-const.c index 01936bd55fd..ae66ad4ee2d 100644 --- a/gcc/fold-const.c +++ b/gcc/fold-const.c @@ -5109,7 +5109,9 @@ fold_cond_expr_with_comparison (tree type, tree arg0, tree arg1, tree arg2) #ifndef LOGICAL_OP_NON_SHORT_CIRCUIT -#define LOGICAL_OP_NON_SHORT_CIRCUIT (BRANCH_COST >= 2) +#define LOGICAL_OP_NON_SHORT_CIRCUIT \ + (BRANCH_COST (!cfun || optimize_function_for_speed_p (cfun), \ + false) >= 2) #endif /* EXP is some logical combination of boolean tests. See if we can @@ -5357,7 +5359,8 @@ fold_truthop (enum tree_code code, tree truth_type, tree lhs, tree rhs) that can be merged. Avoid doing this if the RHS is a floating-point comparison since those can trap. */ - if (BRANCH_COST >= 2 + if (BRANCH_COST (!cfun || optimize_function_for_speed_p (cfun), + false) >= 2 && ! 
FLOAT_TYPE_P (TREE_TYPE (rl_arg)) && simple_operand_p (rl_arg) && simple_operand_p (rr_arg)) diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c index da8afde3106..51fc48c6043 100644 --- a/gcc/ifcvt.c +++ b/gcc/ifcvt.c @@ -67,7 +67,9 @@ #endif #ifndef MAX_CONDITIONAL_EXECUTE -#define MAX_CONDITIONAL_EXECUTE (BRANCH_COST + 1) +#define MAX_CONDITIONAL_EXECUTE \ + (BRANCH_COST (optimize_function_for_speed_p (cfun), false) \ + + 1) #endif #define IFCVT_MULTIPLE_DUMPS 1 @@ -626,6 +628,9 @@ struct noce_if_info from TEST_BB. For the noce transformations, we allow the symmetric form as well. */ bool then_else_reversed; + + /* Estimated cost of the particular branch instruction. */ + int branch_cost; }; static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int); @@ -963,20 +968,20 @@ noce_try_store_flag_constants (struct noce_if_info *if_info) normalize = 0; else if (ifalse == 0 && exact_log2 (itrue) >= 0 && (STORE_FLAG_VALUE == 1 - || BRANCH_COST >= 2)) + || if_info->branch_cost >= 2)) normalize = 1; else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse - && (STORE_FLAG_VALUE == 1 || BRANCH_COST >= 2)) + && (STORE_FLAG_VALUE == 1 || if_info->branch_cost >= 2)) normalize = 1, reversep = 1; else if (itrue == -1 && (STORE_FLAG_VALUE == -1 - || BRANCH_COST >= 2)) + || if_info->branch_cost >= 2)) normalize = -1; else if (ifalse == -1 && can_reverse - && (STORE_FLAG_VALUE == -1 || BRANCH_COST >= 2)) + && (STORE_FLAG_VALUE == -1 || if_info->branch_cost >= 2)) normalize = -1, reversep = 1; - else if ((BRANCH_COST >= 2 && STORE_FLAG_VALUE == -1) - || BRANCH_COST >= 3) + else if ((if_info->branch_cost >= 2 && STORE_FLAG_VALUE == -1) + || if_info->branch_cost >= 3) normalize = -1; else return FALSE; @@ -1107,7 +1112,7 @@ noce_try_addcc (struct noce_if_info *if_info) /* If that fails, construct conditional increment or decrement using setcc. 
*/ - if (BRANCH_COST >= 2 + if (if_info->branch_cost >= 2 && (XEXP (if_info->a, 1) == const1_rtx || XEXP (if_info->a, 1) == constm1_rtx)) { @@ -1158,7 +1163,7 @@ noce_try_store_flag_mask (struct noce_if_info *if_info) int reversep; reversep = 0; - if ((BRANCH_COST >= 2 + if ((if_info->branch_cost >= 2 || STORE_FLAG_VALUE == -1) && ((if_info->a == const0_rtx && rtx_equal_p (if_info->b, if_info->x)) @@ -1317,7 +1322,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) /* ??? FIXME: Magic number 5. */ if (cse_not_expected && MEM_P (a) && MEM_P (b) - && BRANCH_COST >= 5) + && if_info->branch_cost >= 5) { a = XEXP (a, 0); b = XEXP (b, 0); @@ -1347,7 +1352,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) if (insn_a) { insn_cost = insn_rtx_cost (PATTERN (insn_a)); - if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST)) + if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost)) return FALSE; } else @@ -1356,7 +1361,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) if (insn_b) { insn_cost += insn_rtx_cost (PATTERN (insn_b)); - if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST)) + if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost)) return FALSE; } @@ -2831,6 +2836,8 @@ noce_find_if_block (basic_block test_bb, if_info.cond_earliest = cond_earliest; if_info.jump = jump; if_info.then_else_reversed = then_else_reversed; + if_info.branch_cost = BRANCH_COST (optimize_bb_for_speed_p (test_bb), + predictable_edge_p (then_edge)); /* Do the real work. */ @@ -3597,7 +3604,9 @@ find_if_case_1 (basic_block test_bb, edge then_edge, edge else_edge) test_bb->index, then_bb->index); /* THEN is small. */ - if (! cheap_bb_rtx_cost_p (then_bb, COSTS_N_INSNS (BRANCH_COST))) + if (! cheap_bb_rtx_cost_p (then_bb, + COSTS_N_INSNS (BRANCH_COST (optimize_bb_for_speed_p (then_edge->src), + predictable_edge_p (then_edge))))) return FALSE; /* Registers set are dead, or are predicable. 
*/ @@ -3711,7 +3720,9 @@ find_if_case_2 (basic_block test_bb, edge then_edge, edge else_edge) test_bb->index, else_bb->index); /* ELSE is small. */ - if (! cheap_bb_rtx_cost_p (else_bb, COSTS_N_INSNS (BRANCH_COST))) + if (! cheap_bb_rtx_cost_p (else_bb, + COSTS_N_INSNS (BRANCH_COST (optimize_bb_for_speed_p (else_edge->src), + predictable_edge_p (else_edge))))) return FALSE; /* Registers set are dead, or are predicable. */ diff --git a/gcc/optabs.c b/gcc/optabs.c index 5d2545dd3b9..0c3b9caeed5 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3443,7 +3443,9 @@ expand_abs_nojump (enum machine_mode mode, rtx op0, rtx target, value of X as (((signed) x >> (W-1)) ^ x) - ((signed) x >> (W-1)), where W is the width of MODE. */ - if (GET_MODE_CLASS (mode) == MODE_INT && BRANCH_COST >= 2) + if (GET_MODE_CLASS (mode) == MODE_INT + && BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 2) { rtx extended = expand_shift (RSHIFT_EXPR, mode, op0, size_int (GET_MODE_BITSIZE (mode) - 1), diff --git a/gcc/params.def b/gcc/params.def index c71b2b6500e..5e9a664208f 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -78,6 +78,13 @@ DEFPARAM (PARAM_STRUCT_REORG_COLD_STRUCT_RATIO, "The threshold ratio between current and hottest structure counts", 10, 0, 100) +/* When branch is predicted to be taken with probability lower than this + threshold (in percent), then it is considered well predictable. */ +DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME, + "predictable-branch-outcome", + "Maximal estimated outcome of branch considered predictable", + 2, 0, 50) + /* The single function inlining limit. 
This is the maximum size of a function counted in internal gcc instructions (not in real machine instructions) that is eligible for inlining diff --git a/gcc/predict.c b/gcc/predict.c index d5de938ccfa..6ca1a0cddc2 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -318,6 +318,23 @@ optimize_loop_nest_for_size_p (struct loop *loop) return !optimize_loop_nest_for_speed_p (loop); } +/* Return true when edge E is likely to be well predictable by branch + predictor. */ + +bool +predictable_edge_p (edge e) +{ + if (profile_status == PROFILE_ABSENT) + return false; + if ((e->probability + <= PARAM_VALUE (PARAM_PREDICTABLE_BRANCH_OUTCOME) * REG_BR_PROB_BASE / 100) + || (REG_BR_PROB_BASE - e->probability + <= PARAM_VALUE (PARAM_PREDICTABLE_BRANCH_OUTCOME) * REG_BR_PROB_BASE / 100)) + return true; + return false; +} + + /* Set RTL expansion for BB profile. */ void