re PR tree-optimization/79390 (10% performance drop in SciMark2 LU after r242550)

PR tree-optimization/79390
	* target.h (struct noce_if_info): Declare.
	* targhooks.h (default_noce_conversion_profitable_p): Declare.
	* target.def (noce_conversion_profitable_p): New target hook.
	* ifcvt.h (struct noce_if_info): New type, moved from ...
	* ifcvt.c (struct noce_if_info): ... here.
	(noce_conversion_profitable_p): Renamed to ...
	(default_noce_conversion_profitable_p): ... this.  No longer
	static nor inline.
	(noce_try_store_flag_constants, noce_try_addcc,
	noce_try_store_flag_mask, noce_try_cmove, noce_try_cmove_arith,
	noce_convert_multiple_sets): Use targetm.noce_conversion_profitable_p
	instead of noce_conversion_profitable_p.
	* config/i386/i386.c: Include ifcvt.h.
	(ix86_option_override_internal): Don't override
	PARAM_MAX_RTL_IF_CONVERSION_INSNS default.
	(ix86_noce_conversion_profitable_p): New function.
	(TARGET_NOCE_CONVERSION_PROFITABLE_P): Redefine.
	* config/i386/x86-tune.def (X86_TUNE_ONE_IF_CONV_INSN): Adjust comment.
	* doc/tm.texi.in (TARGET_NOCE_CONVERSION_PROFITABLE_P): Add.
	* doc/tm.texi: Regenerated.

	* gcc.target/i386/pr79390.c: New test.
	* gcc.dg/ifcvt-4.c: Use -mtune-ctrl=^one_if_conv_insn for i?86/x86_64.

From-SVN: r246686
This commit is contained in:
Jakub Jelinek 2017-04-04 19:52:27 +02:00 committed by Jakub Jelinek
parent e2b2c4b9fb
commit d16e2ddd26
13 changed files with 209 additions and 91 deletions

View File

@ -1,3 +1,27 @@
2017-04-04 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/79390
* target.h (struct noce_if_info): Declare.
* targhooks.h (default_noce_conversion_profitable_p): Declare.
* target.def (noce_conversion_profitable_p): New target hook.
* ifcvt.h (struct noce_if_info): New type, moved from ...
* ifcvt.c (struct noce_if_info): ... here.
(noce_conversion_profitable_p): Renamed to ...
(default_noce_conversion_profitable_p): ... this. No longer
static nor inline.
(noce_try_store_flag_constants, noce_try_addcc,
noce_try_store_flag_mask, noce_try_cmove, noce_try_cmove_arith,
noce_convert_multiple_sets): Use targetm.noce_conversion_profitable_p
instead of noce_conversion_profitable_p.
* config/i386/i386.c: Include ifcvt.h.
(ix86_option_override_internal): Don't override
PARAM_MAX_RTL_IF_CONVERSION_INSNS default.
(ix86_noce_conversion_profitable_p): New function.
(TARGET_NOCE_CONVERSION_PROFITABLE_P): Redefine.
* config/i386/x86-tune.def (X86_TUNE_ONE_IF_CONV_INSN): Adjust comment.
* doc/tm.texi.in (TARGET_NOCE_CONVERSION_PROFITABLE_P): Add.
* doc/tm.texi: Regenerated.
2017-04-04 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
* doc/extend.texi (PowerPC AltiVec Built-in Functions): Grammar

View File

@ -84,6 +84,7 @@ along with GCC; see the file COPYING3. If not see
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
/* This file should be included last. */
#include "target-def.h"
@ -6104,13 +6105,6 @@ ix86_option_override_internal (bool main_args_p,
opts->x_param_values,
opts_set->x_param_values);
/* Restrict number of if-converted SET insns to 1. */
if (TARGET_ONE_IF_CONV_INSN)
maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
1,
opts->x_param_values,
opts_set->x_param_values);
/* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */
if (opts->x_flag_prefetch_loop_arrays < 0
&& HAVE_prefetch
@ -50629,6 +50623,41 @@ ix86_max_noce_ifcvt_seq_cost (edge e)
return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
}
/* Return true if SEQ is a good candidate as a replacement for the
if-convertible sequence described in IF_INFO. */
static bool
ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
{
if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
{
int cmov_cnt = 0;
/* Punt if SEQ contains more than one CMOV or FCMOV instruction.
Maybe we should allow even more conditional moves as long as they
are used far enough not to stall the CPU, or also consider
IF_INFO->TEST_BB succ edge probabilities. */
for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
{
rtx set = single_set (insn);
if (!set)
continue;
if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
continue;
rtx src = SET_SRC (set);
enum machine_mode mode = GET_MODE (src);
if (GET_MODE_CLASS (mode) != MODE_INT
&& GET_MODE_CLASS (mode) != MODE_FLOAT)
continue;
if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
|| (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
continue;
/* insn is CMOV or FCMOV. */
if (++cmov_cnt > 1)
return false;
}
}
return default_noce_conversion_profitable_p (seq, if_info);
}
/* Implement targetm.vectorize.init_cost. */
@ -52181,6 +52210,10 @@ ix86_run_selftests (void)
#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests

View File

@ -547,7 +547,7 @@ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)
the unroll factor so that the unrolled loop fits the loop buffer. */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
/* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of set insns to be
if-converted to one. */
/* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of cmov insns in
if-converted sequence to one. */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
m_SILVERMONT | m_KNL | m_INTEL | m_CORE_ALL | m_GENERIC)

View File

@ -6644,6 +6644,12 @@ The default implementation of this hook uses the
and uses a multiple of @code{BRANCH_COST} otherwise.
@end deftypefn
@deftypefn {Target Hook} bool TARGET_NOCE_CONVERSION_PROFITABLE_P (rtx_insn *@var{seq}, struct noce_if_info *@var{if_info})
This hook returns true if the instruction sequence @code{seq} is a good
candidate as a replacement for the if-convertible sequence described in
@code{if_info}.
@end deftypefn
@deftypefn {Target Hook} bool TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P (void)
This predicate controls the use of the eager delay slot filler to disallow
speculatively executed instructions being placed in delay slots. Targets

View File

@ -4796,6 +4796,8 @@ Define this macro if a non-short-circuit operation produced by
@hook TARGET_MAX_NOCE_IFCVT_SEQ_COST
@hook TARGET_NOCE_CONVERSION_PROFITABLE_P
@hook TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P
@node Scheduling

View File

@ -760,76 +760,6 @@ cond_exec_process_if_block (ce_if_block * ce_info,
return FALSE;
}
/* Used by noce_process_if_block to communicate with its subroutines.
The subroutines know that A and B may be evaluated freely. They
know that X is a register. They should insert new instructions
before cond_earliest. */
struct noce_if_info
{
/* The basic blocks that make up the IF-THEN-{ELSE-,}JOIN block. */
basic_block test_bb, then_bb, else_bb, join_bb;
/* The jump that ends TEST_BB. */
rtx_insn *jump;
/* The jump condition. */
rtx cond;
/* Reversed jump condition. */
rtx rev_cond;
/* New insns should be inserted before this one. */
rtx_insn *cond_earliest;
/* Insns in the THEN and ELSE block. There is always just this
one insns in those blocks. The insns are single_set insns.
If there was no ELSE block, INSN_B is the last insn before
COND_EARLIEST, or NULL_RTX. In the former case, the insn
operands are still valid, as if INSN_B was moved down below
the jump. */
rtx_insn *insn_a, *insn_b;
/* The SET_SRC of INSN_A and INSN_B. */
rtx a, b;
/* The SET_DEST of INSN_A. */
rtx x;
/* The original set destination that the THEN and ELSE basic blocks finally
write their result to. */
rtx orig_x;
/* True if this if block is not canonical. In the canonical form of
if blocks, the THEN_BB is the block reached via the fallthru edge
from TEST_BB. For the noce transformations, we allow the symmetric
form as well. */
bool then_else_reversed;
/* True if the contents of then_bb and else_bb are a
simple single set instruction. */
bool then_simple;
bool else_simple;
/* True if we're optimisizing the control block for speed, false if
we're optimizing for size. */
bool speed_p;
/* An estimate of the original costs. When optimizing for size, this is the
combined cost of COND, JUMP and the costs for THEN_BB and ELSE_BB.
When optimizing for speed, we use the costs of COND plus the minimum of
the costs for THEN_BB and ELSE_BB, as computed in the next field. */
unsigned int original_cost;
/* Maximum permissible cost for the unconditional sequence we should
generate to replace this branch. */
unsigned int max_seq_cost;
/* The name of the noce transform that succeeded in if-converting
this structure. Used for debugging. */
const char *transform_name;
};
static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int);
static int noce_try_move (struct noce_if_info *);
static int noce_try_ifelse_collapse (struct noce_if_info *);
@ -857,11 +787,14 @@ noce_reversed_cond_code (struct noce_if_info *if_info)
return reversed_comparison_code (if_info->cond, if_info->jump);
}
/* Return TRUE if SEQ is a good candidate as a replacement for the
if-convertible sequence described in IF_INFO. */
/* Return true if SEQ is a good candidate as a replacement for the
if-convertible sequence described in IF_INFO.
This is the default implementation that targets can override
through a target hook. */
inline static bool
noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
bool
default_noce_conversion_profitable_p (rtx_insn *seq,
struct noce_if_info *if_info)
{
bool speed_p = if_info->speed_p;
@ -1544,7 +1477,7 @@ noce_try_store_flag_constants (struct noce_if_info *if_info)
noce_emit_move_insn (if_info->x, target);
seq = end_ifcvt_sequence (if_info);
if (!seq || !noce_conversion_profitable_p (seq, if_info))
if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info))
return FALSE;
emit_insn_before_setloc (seq, if_info->jump,
@ -1605,7 +1538,7 @@ noce_try_addcc (struct noce_if_info *if_info)
noce_emit_move_insn (if_info->x, target);
seq = end_ifcvt_sequence (if_info);
if (!seq || !noce_conversion_profitable_p (seq, if_info))
if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info))
return FALSE;
emit_insn_before_setloc (seq, if_info->jump,
@ -1647,7 +1580,7 @@ noce_try_addcc (struct noce_if_info *if_info)
noce_emit_move_insn (if_info->x, target);
seq = end_ifcvt_sequence (if_info);
if (!seq || !noce_conversion_profitable_p (seq, if_info))
if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info))
return FALSE;
emit_insn_before_setloc (seq, if_info->jump,
@ -1698,7 +1631,7 @@ noce_try_store_flag_mask (struct noce_if_info *if_info)
noce_emit_move_insn (if_info->x, target);
seq = end_ifcvt_sequence (if_info);
if (!seq || !noce_conversion_profitable_p (seq, if_info))
if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info))
return FALSE;
emit_insn_before_setloc (seq, if_info->jump,
@ -1850,7 +1783,7 @@ noce_try_cmove (struct noce_if_info *if_info)
noce_emit_move_insn (if_info->x, target);
seq = end_ifcvt_sequence (if_info);
if (!seq || !noce_conversion_profitable_p (seq, if_info))
if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info))
return FALSE;
emit_insn_before_setloc (seq, if_info->jump,
@ -1903,7 +1836,7 @@ noce_try_cmove (struct noce_if_info *if_info)
noce_emit_move_insn (if_info->x, target);
seq = end_ifcvt_sequence (if_info);
if (!seq || !noce_conversion_profitable_p (seq, if_info))
if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info))
return FALSE;
emit_insn_before_setloc (seq, if_info->jump,
@ -2345,7 +2278,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
noce_emit_move_insn (x, target);
ifcvt_seq = end_ifcvt_sequence (if_info);
if (!ifcvt_seq || !noce_conversion_profitable_p (ifcvt_seq, if_info))
if (!ifcvt_seq || !targetm.noce_conversion_profitable_p (ifcvt_seq, if_info))
return FALSE;
emit_insn_before_setloc (ifcvt_seq, if_info->jump,
@ -3308,7 +3241,7 @@ noce_convert_multiple_sets (struct noce_if_info *if_info)
/* Actually emit the sequence if it isn't too expensive. */
rtx_insn *seq = get_insns ();
if (!noce_conversion_profitable_p (seq, if_info))
if (!targetm.noce_conversion_profitable_p (seq, if_info))
{
end_sequence ();
return FALSE;

View File

@ -40,4 +40,74 @@ struct ce_if_block
int pass; /* Pass number. */
};
/* Used by noce_process_if_block to communicate with its subroutines.
The subroutines know that A and B may be evaluated freely. They
know that X is a register. They should insert new instructions
before cond_earliest. */
struct noce_if_info
{
/* The basic blocks that make up the IF-THEN-{ELSE-,}JOIN block. */
basic_block test_bb, then_bb, else_bb, join_bb;
/* The jump that ends TEST_BB. */
rtx_insn *jump;
/* The jump condition. */
rtx cond;
/* Reversed jump condition. */
rtx rev_cond;
/* New insns should be inserted before this one. */
rtx_insn *cond_earliest;
/* Insns in the THEN and ELSE block. There is always just this
one insns in those blocks. The insns are single_set insns.
If there was no ELSE block, INSN_B is the last insn before
COND_EARLIEST, or NULL_RTX. In the former case, the insn
operands are still valid, as if INSN_B was moved down below
the jump. */
rtx_insn *insn_a, *insn_b;
/* The SET_SRC of INSN_A and INSN_B. */
rtx a, b;
/* The SET_DEST of INSN_A. */
rtx x;
/* The original set destination that the THEN and ELSE basic blocks finally
write their result to. */
rtx orig_x;
/* True if this if block is not canonical. In the canonical form of
if blocks, the THEN_BB is the block reached via the fallthru edge
from TEST_BB. For the noce transformations, we allow the symmetric
form as well. */
bool then_else_reversed;
/* True if the contents of then_bb and else_bb are a
simple single set instruction. */
bool then_simple;
bool else_simple;
/* True if we're optimisizing the control block for speed, false if
we're optimizing for size. */
bool speed_p;
/* An estimate of the original costs. When optimizing for size, this is the
combined cost of COND, JUMP and the costs for THEN_BB and ELSE_BB.
When optimizing for speed, we use the costs of COND plus the minimum of
the costs for THEN_BB and ELSE_BB, as computed in the next field. */
unsigned int original_cost;
/* Maximum permissible cost for the unconditional sequence we should
generate to replace this branch. */
unsigned int max_seq_cost;
/* The name of the noce transform that succeeded in if-converting
this structure. Used for debugging. */
const char *transform_name;
};
#endif /* GCC_IFCVT_H */

View File

@ -3660,6 +3660,16 @@ and uses a multiple of @code{BRANCH_COST} otherwise.",
unsigned int, (edge e),
default_max_noce_ifcvt_seq_cost)
/* Return true if the given instruction sequence is a good candidate
as a replacement for the if-convertible sequence. */
DEFHOOK
(noce_conversion_profitable_p,
"This hook returns true if the instruction sequence @code{seq} is a good\n\
candidate as a replacement for the if-convertible sequence described in\n\
@code{if_info}.",
bool, (rtx_insn *seq, struct noce_if_info *if_info),
default_noce_conversion_profitable_p)
/* Permit speculative instructions in delay slots during delayed-branch
scheduling. */
DEFHOOK

View File

@ -140,6 +140,9 @@ struct ddg;
/* This is defined in cfgloop.h . */
struct loop;
/* This is defined in ifcvt.h. */
struct noce_if_info;
/* This is defined in tree-ssa-alias.h. */
struct ao_ref;

View File

@ -257,6 +257,8 @@ extern void default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE
extern bool default_optab_supported_p (int, machine_mode, machine_mode,
optimization_type);
extern unsigned int default_max_noce_ifcvt_seq_cost (edge);
extern bool default_noce_conversion_profitable_p (rtx_insn *,
struct noce_if_info *);
extern unsigned int default_min_arithmetic_precision (void);
extern enum flt_eval_method

View File

@ -1,3 +1,9 @@
2017-04-04 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/79390
* gcc.target/i386/pr79390.c: New test.
* gcc.dg/ifcvt-4.c: Use -mtune-ctrl=^one_if_conv_insn for i?86/x86_64.
2017-04-04 Volker Reichelt <v.reichelt@netcologne.de>
PR c++/80296

View File

@ -1,6 +1,7 @@
/* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-insns=3 --param max-rtl-if-conversion-unpredictable-cost=100" } */
/* { dg-additional-options "-misel" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-march=z196" { target { s390x-*-* } } } */
/* { dg-additional-options "-mtune-ctrl=^one_if_conv_insn" { target { i?86-*-* x86_64-*-* } } } */
/* { dg-skip-if "Multiple set if-conversion not guaranteed on all subtargets" { "arm*-*-* hppa*64*-*-* s390-*-* visium-*-*" riscv*-*-* } } */
/* { dg-skip-if "" { "s390x-*-*" } { "-m31" } } */

View File

@ -0,0 +1,28 @@
/* PR tree-optimization/79390 */
/* { dg-do compile { target lp64 } } */
/* { dg-options "-O2 -march=haswell -mtune=haswell -mfpmath=sse" } */
/* Make sure we emit a conditional move in this loop. */
extern double A[32];
int
foo (void)
{
double t = A[0];
int jp = 0;
int i;
for (i = 0; i < 32; i++)
{
double ab = A[i];
if (ab > t)
{
jp = i;
t = ab;
}
}
return jp;
}
/* { dg-final { scan-assembler "\[ \\t\]cmov\[a-z]+\[ \\t\]" } } */