re PR target/51244 ([SH] Inefficient conditional branch and code around T bit)

PR target/51244
	* config/sh/sh_treg_combine.cc: New SH specific RTL pass.
	* config.gcc (SH extra_objs): Add sh_treg_combine.o.
	* config/sh/t-sh (sh_treg_combine.o): New entry.
	* config/sh/sh.c (sh_fixed_condition_code_regs): New function that
	implements the target hook TARGET_FIXED_CONDITION_CODE_REGS.
	(register_sh_passes): New function.  Register sh_treg_combine pass.
	(sh_option_override): Invoke it.
	(sh_canonicalize_comparison): Handle op0_preserve_value.
	* sh.md (*cbranch_t): Do not try to optimize missed test and branch
	opportunities.  Canonicalize branch condition.
	(nott): Allow only if pseudos can be created for non-SH2A.

	PR target/51244
	* gcc.dg/torture/p51244-21.c: New.
	* gcc.target/sh/pr51244-20.c: New.
	* gcc.target/sh/pr51244-20-sh2a.c: New.

From-SVN: r203492
This commit is contained in:
Oleg Endo 2013-10-12 20:47:22 +00:00
parent 585a0b9916
commit 5d30dc5b6d
10 changed files with 1802 additions and 80 deletions

View File

@ -1,3 +1,18 @@
2013-10-12 Oleg Endo <olegendo@gcc.gnu.org>
PR target/51244
* config/sh/sh_treg_combine.cc: New SH specific RTL pass.
* config.gcc (SH extra_objs): Add sh_treg_combine.o.
* config/sh/t-sh (sh_treg_combine.o): New entry.
* config/sh/sh.c (sh_fixed_condition_code_regs): New function that
implements the target hook TARGET_FIXED_CONDITION_CODE_REGS.
(register_sh_passes): New function. Register sh_treg_combine pass.
(sh_option_override): Invoke it.
(sh_canonicalize_comparison): Handle op0_preserve_value.
* sh.md (*cbranch_t): Do not try to optimize missed test and branch
opportunities. Canonicalize branch condition.
(nott): Allow only if pseudos can be created for non-SH2A.
2013-10-12 H.J. Lu <hongjiu.lu@intel.com>
PR target/58690

View File

@ -465,6 +465,7 @@ sh[123456789lbe]*-*-* | sh-*-*)
cpu_type=sh
need_64bit_hwint=yes
extra_options="${extra_options} fused-madd.opt"
extra_objs="${extra_objs} sh_treg_combine.o"
;;
v850*-*-*)
cpu_type=v850

View File

@ -53,6 +53,9 @@ along with GCC; see the file COPYING3. If not see
#include "alloc-pool.h"
#include "tm-constrs.h"
#include "opts.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "context.h"
#include <sstream>
#include <vector>
@ -311,6 +314,7 @@ static bool sequence_insn_p (rtx);
static void sh_canonicalize_comparison (int *, rtx *, rtx *, bool);
static void sh_canonicalize_comparison (enum rtx_code&, rtx&, rtx&,
enum machine_mode, bool);
static bool sh_fixed_condition_code_regs (unsigned int* p1, unsigned int* p2);
static void sh_init_sync_libfuncs (void) ATTRIBUTE_UNUSED;
@ -587,6 +591,9 @@ static const struct attribute_spec sh_attribute_table[] =
#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON sh_canonicalize_comparison
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS sh_fixed_condition_code_regs
/* Machine-specific symbol_ref flags. */
#define SYMBOL_FLAG_FUNCVEC_FUNCTION (SYMBOL_FLAG_MACH_DEP << 0)
@ -710,6 +717,34 @@ got_mode_name:;
#undef err_ret
}
/* Register SH specific RTL passes. */
extern opt_pass* make_pass_sh_treg_combine (gcc::context* ctx, bool split_insns,
const char* name);
static void
register_sh_passes (void)
{
if (!TARGET_SH1)
return;
/* Running the sh_treg_combine pass after ce1 generates better code when
comparisons are combined and reg-reg moves are introduced, because
reg-reg moves will be eliminated afterwards. However, there are quite
some cases where combine will be unable to fold comparison related insns,
thus for now don't do it.
register_pass (make_pass_sh_treg_combine (g, false, "sh_treg_combine1"),
PASS_POS_INSERT_AFTER, "ce1", 1);
*/
/* Run sh_treg_combine pass after combine but before register allocation. */
register_pass (make_pass_sh_treg_combine (g, true, "sh_treg_combine2"),
PASS_POS_INSERT_AFTER, "split1", 1);
/* Run sh_treg_combine pass after register allocation and basic block
reordering as this sometimes creates new opportunities. */
register_pass (make_pass_sh_treg_combine (g, true, "sh_treg_combine3"),
PASS_POS_INSERT_AFTER, "split4", 1);
}
/* Implement TARGET_OPTION_OVERRIDE macro. Validate and override
various options, and do some machine dependent initialization. */
static void
@ -1022,6 +1057,8 @@ sh_option_override (void)
target CPU. */
selected_atomic_model_
= parse_validate_atomic_model_option (sh_atomic_model_str);
register_sh_passes ();
}
/* Print the operand address in x to the stream. */
@ -1908,7 +1945,7 @@ prepare_move_operands (rtx operands[], enum machine_mode mode)
static void
sh_canonicalize_comparison (enum rtx_code& cmp, rtx& op0, rtx& op1,
enum machine_mode mode,
bool op0_preserve_value ATTRIBUTE_UNUSED)
bool op0_preserve_value)
{
/* When invoked from within the combine pass the mode is not specified,
so try to get it from one of the operands. */
@ -1928,6 +1965,9 @@ sh_canonicalize_comparison (enum rtx_code& cmp, rtx& op0, rtx& op1,
// Make sure that the constant operand is the second operand.
if (CONST_INT_P (op0) && !CONST_INT_P (op1))
{
if (op0_preserve_value)
return;
std::swap (op0, op1);
cmp = swap_condition (cmp);
}
@ -2016,6 +2056,14 @@ sh_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
*code = (int)tmp_code;
}
/* Implement TARGET_FIXED_CONDITION_CODE_REGS.  Store T_REG in *P1 and
   INVALID_REGNUM in *P2, i.e. report the T bit register as the only fixed
   condition code register.  Always returns true.  */
bool
sh_fixed_condition_code_regs (unsigned int* p1, unsigned int* p2)
{
*p1 = T_REG;
*p2 = INVALID_REGNUM;
return true;
}
enum rtx_code
prepare_cbranch_operands (rtx *operands, enum machine_mode mode,
enum rtx_code comparison)

View File

@ -8419,89 +8419,32 @@ label:
return output_branch (sh_eval_treg_value (operands[1]), insn, operands);
}
"&& 1"
[(set (pc) (if_then_else (eq (reg:SI T_REG) (match_dup 2))
(label_ref (match_dup 0))
(pc)))]
[(const_int 0)]
{
/* Try to find missed test and branch combine opportunities which result
in redundant T bit tests before conditional branches.
This is done not only after combine (and before reload) but in every
split pass, because some opportunities are formed also after combine.
FIXME: Probably this would not be needed if CCmode was used
together with TARGET_FIXED_CONDITION_CODE_REGS. */
/* Try to canonicalize the branch condition if it is not one of:
(ne (reg:SI T_REG) (const_int 0))
(eq (reg:SI T_REG) (const_int 0))
const int treg_value = sh_eval_treg_value (operands[1]);
operands[2] = NULL_RTX;
Instead of splitting out a new insn, we modify the current insn's
operands as needed. This preserves things such as REG_DEAD notes. */
/* Scan the insns backwards for an insn that sets the T bit by testing a
reg against zero like:
(set (reg T_REG) (eq (reg) (const_int 0))) */
rtx testing_insn = NULL_RTX;
rtx tested_reg = NULL_RTX;
if ((GET_CODE (operands[1]) == EQ || GET_CODE (operands[1]) == NE)
&& REG_P (XEXP (operands[1], 0)) && REGNO (XEXP (operands[1], 0)) == T_REG
&& XEXP (operands[1], 1) == const0_rtx)
DONE;
set_of_reg s0 = sh_find_set_of_reg (get_t_reg_rtx (), curr_insn,
prev_nonnote_insn_bb);
if (s0.set_src != NULL_RTX
&& GET_CODE (s0.set_src) == EQ
&& REG_P (XEXP (s0.set_src, 0))
&& satisfies_constraint_Z (XEXP (s0.set_src, 1)))
{
testing_insn = s0.insn;
tested_reg = XEXP (s0.set_src, 0);
}
else
FAIL;
int branch_cond = sh_eval_treg_value (operands[1]);
rtx new_cond_rtx = NULL_RTX;
/* Continue scanning the insns backwards and try to find the insn that
sets the tested reg which we found above. If the reg is set by storing
the T bit or the negated T bit we can eliminate the test insn before
the branch. Notice that the branch condition has to be inverted if the
test is eliminated. */
if (branch_cond == 0)
new_cond_rtx = gen_rtx_EQ (VOIDmode, get_t_reg_rtx (), const0_rtx);
else if (branch_cond == 1)
new_cond_rtx = gen_rtx_NE (VOIDmode, get_t_reg_rtx (), const0_rtx);
/* If the T bit is used between the testing insn and the branch insn
leave it alone. */
if (reg_used_between_p (get_t_reg_rtx (), testing_insn, curr_insn))
FAIL;
while (true)
{
/* It's not safe to go beyond the current basic block after reload. */
set_of_reg s1 = sh_find_set_of_reg (tested_reg, s0.insn,
reload_completed
? prev_nonnote_insn_bb
: prev_nonnote_insn);
if (s1.set_src == NULL_RTX)
break;
if (t_reg_operand (s1.set_src, VOIDmode))
operands[2] = GEN_INT (treg_value ^ 1);
else if (negt_reg_operand (s1.set_src, VOIDmode))
operands[2] = GEN_INT (treg_value);
else if (REG_P (s1.set_src))
{
/* If it's a reg-reg copy follow the copied reg. This can
happen e.g. when T bit store zero-extensions are
eliminated. */
tested_reg = s1.set_src;
s0.insn = s1.insn;
continue;
}
/* It's only safe to remove the testing insn if the T bit is not
modified between the testing insn and the insn that stores the
T bit. Notice that some T bit stores such as negc also modify
the T bit. */
if (modified_between_p (get_t_reg_rtx (), s1.insn, testing_insn)
|| modified_in_p (get_t_reg_rtx (), s1.insn))
operands[2] = NULL_RTX;
break;
}
if (operands[2] == NULL_RTX)
FAIL;
set_insn_deleted (testing_insn);
if (new_cond_rtx != NULL_RTX)
validate_change (curr_insn, &XEXP (XEXP (PATTERN (curr_insn), 1), 0),
new_cond_rtx, false);
DONE;
}
[(set_attr "type" "cbranch")])
@ -11480,10 +11423,13 @@ label:
;; multiple insns like:
;; movt Rn
;; tst Rn,Rn
;; This requires an additional pseudo. The SH specific sh_treg_combine RTL
;; pass will look for this insn. Disallow using it if pseudos can't be
;; created.
(define_insn_and_split "nott"
[(set (reg:SI T_REG)
(xor:SI (match_operand:SI 0 "t_reg_operand" "") (const_int 1)))]
"TARGET_SH1"
(xor:SI (match_operand:SI 0 "t_reg_operand") (const_int 1)))]
"TARGET_SH2A || (TARGET_SH1 && can_create_pseudo_p ())"
{
gcc_assert (TARGET_SH2A);
return "nott";

File diff suppressed because it is too large Load Diff

View File

@ -21,6 +21,10 @@ sh-c.o: $(srcdir)/config/sh/sh-c.c \
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/sh/sh-c.c
sh_treg_combine.o: $(srcdir)/config/sh/sh_treg_combine.cc \
$(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_H) $(TM_P_H) coretypes.h
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
DEFAULT_ENDIAN = $(word 1,$(TM_ENDIAN_CONFIG))
OTHER_ENDIAN = $(word 2,$(TM_ENDIAN_CONFIG))

View File

@ -1,3 +1,10 @@
2013-10-12 Oleg Endo <olegendo@gcc.gnu.org>
PR target/51244
* gcc.dg/torture/p51244-21.c: New.
* gcc.target/sh/pr51244-20.c: New.
* gcc.target/sh/pr51244-20-sh2a.c: New.
2013-10-12 Arnaud Charlet <charlet@adacore.com>
* gnat.dg/specs/linker_section.ads: Update test.

View File

@ -0,0 +1,75 @@
/* { dg-do run } */
#include <assert.h>
/* Predicate over the three-element array Q: if Q[2] is nonzero the result
   is (Q[1] != 0), otherwise it is (Q[0] == 0).  The exact shape of this
   function is part of the test input, so the code is left untouched.  */
static inline int
blk_oversized_queue (int* q)
{
if (q[2])
return q[1] != 0;
return q[0] == 0;
}
/* Return -100 if blk_oversized_queue (Q) is false.  Otherwise return -33
   when RW is 0 or 1, and 0 for any other RW.  Marked noinline so that the
   calls in main are not folded into the caller.  */
int __attribute__ ((noinline))
get_request (int* q, int rw)
{
if (blk_oversized_queue (q))
{
if ((rw == 1) || (rw == 0))
return -33;
return 0;
}
return -100;
}
/* Drive get_request through all four outcomes of blk_oversized_queue
   (both values of q[2], each with the predicate true and false) and, for
   each of them, through rw == 0, rw == 1 and rw == 2.  Any wrong result
   trips an assert.  */
int main (void)
{
  int x[3];
  int r;

  /* q[2] != 0 and q[1] != 0: predicate is true.  */
  x[0] = 0; x[1] = 1; x[2] = 1;
  r = get_request (x, 0);
  assert (r == -33);
  r = get_request (x, 1);
  assert (r == -33);
  r = get_request (x, 2);
  assert (r == 0);

  /* q[2] != 0 and q[1] == 0: predicate is false.  */
  x[0] = 0; x[1] = 0; x[2] = 1;
  r = get_request (x, 0);
  assert (r == -100);
  r = get_request (x, 1);
  assert (r == -100);
  r = get_request (x, 2);
  assert (r == -100);

  /* q[2] == 0 and q[0] == 0: predicate is true.  */
  x[0] = 0; x[1] = 0; x[2] = 0;
  r = get_request (x, 0);
  assert (r == -33);
  r = get_request (x, 1);
  assert (r == -33);
  r = get_request (x, 2);
  assert (r == 0);

  /* q[2] == 0 and q[0] != 0: predicate is false.  Without this case the
     "q[0] == 0 is false" path would never be executed.  */
  x[0] = 1; x[1] = 0; x[2] = 0;
  r = get_request (x, 0);
  assert (r == -100);
  r = get_request (x, 1);
  assert (r == -100);
  r = get_request (x, 2);
  assert (r == -100);

  return 0;
}

View File

@ -0,0 +1,14 @@
/* Check that the SH specific sh_treg_combine RTL optimization pass works as
expected. */
/* { dg-do compile { target "sh*-*-*" } } */
/* { dg-options "-O2" } */
/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m2a*" } } */
/* { dg-final { scan-assembler-times "tst" 5 } } */
/* { dg-final { scan-assembler-times "movt" 0 } } */
/* { dg-final { scan-assembler-times "nott" 1 } } */
/* { dg-final { scan-assembler-times "cmp/eq" 2 } } */
/* { dg-final { scan-assembler-times "cmp/hi" 4 } } */
/* { dg-final { scan-assembler-times "cmp/gt" 3 } } */
/* { dg-final { scan-assembler-times "not\t" 1 } } */
#include "pr51244-20.c"

View File

@ -0,0 +1,103 @@
/* Check that the SH specific sh_treg_combine RTL optimization pass works as
expected. On SH2A the expected insns are slightly different, see
pr51244-20-sh2a.c. */
/* { dg-do compile { target "sh*-*-*" } } */
/* { dg-options "-O2" } */
/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" "-m2a*" } { "" } } */
/* { dg-final { scan-assembler-times "tst" 6 } } */
/* { dg-final { scan-assembler-times "movt" 1 } } */
/* { dg-final { scan-assembler-times "cmp/eq" 2 } } */
/* { dg-final { scan-assembler-times "cmp/hi" 4 } } */
/* { dg-final { scan-assembler-times "cmp/gt" 2 } } */
/* { dg-final { scan-assembler-times "not\t" 1 } } */
/* non-SH2A: 2x tst, 1x movt, 2x cmp/eq, 1x cmp/hi
SH2A: 1x tst, 1x nott, 2x cmp/eq, 1x cmp/hi */
/* Predicate with opposite comparison senses on the two paths: if Q[2] is
   nonzero the result is (Q[1] == 5), otherwise it is (Q[0] != 5).  The
   source shape determines the insn counts scanned for above, so the code
   must stay as is.  */
static inline int
blk_oversized_queue_0 (int* q)
{
if (q[2])
return q[1] == 5;
return (q[0] != 5);
}
/* Return -100 if blk_oversized_queue_0 (Q) is false.  Otherwise return -33
   when RW is 0 or 1, and 0 for any other RW.  Marked noinline so it is
   emitted as a function of its own.  */
int __attribute__ ((noinline))
get_request_0 (int* q, int rw)
{
if (blk_oversized_queue_0 (q))
{
if ((rw == 1) || (rw == 0))
return -33;
return 0;
}
return -100;
}
/* 1x tst, 1x cmp/gt, 1x cmp/hi
On SH2A mem loads/stores have a wrong length of 4 bytes and thus will
not be placed in a delay slot. This introduces an extra cmp/gt insn. */
/* Predicate using the same comparison on both paths: if Q[2] is nonzero
   the result is (Q[1] > 5), otherwise it is (Q[0] > 5).  */
static inline int
blk_oversized_queue_1 (int* q)
{
if (q[2])
return q[1] > 5;
return (q[0] > 5);
}
/* Return -100 if blk_oversized_queue_1 (Q) is false.  Otherwise return -33
   when RW is 0 or 1, and 0 for any other RW.  Marked noinline so it is
   emitted as a function of its own.  */
int __attribute__ ((noinline))
get_request_1 (int* q, int rw)
{
if (blk_oversized_queue_1 (q))
{
if ((rw == 1) || (rw == 0))
return -33;
return 0;
}
return -100;
}
/* 1x tst, 1x cmp/gt, 1x cmp/hi, 1x cmp/hi */
/* Predicate using different orderings on the two paths: if Q[2] is nonzero
   the result is (Q[1] > 5), otherwise it is (Q[0] < 5).  */
static inline int
blk_oversized_queue_2 (int* q)
{
if (q[2])
return q[1] > 5;
return (q[0] < 5);
}
/* Return -100 if blk_oversized_queue_2 (Q) is false.  Otherwise return -33
   when RW is 0 or 1, and 0 for any other RW.  Marked noinline so it is
   emitted as a function of its own.  */
int __attribute__ ((noinline))
get_request_2 (int* q, int rw)
{
if (blk_oversized_queue_2 (q))
{
if ((rw == 1) || (rw == 0))
return -33;
return 0;
}
return -100;
}
/* 2x tst, 1x cmp/hi, 1x not */
/* Predicate comparing against zero with opposite senses: if Q[2] is
   nonzero the result is (Q[1] != 0), otherwise it is (Q[0] == 0).  */
static inline int
blk_oversized_queue_5 (int* q)
{
if (q[2])
return q[1] != 0;
return q[0] == 0;
}
/* Return -100 if blk_oversized_queue_5 (Q) is false.  Otherwise return -33
   when RW is 0 or 1, and 0 for any other RW.  Marked noinline so it is
   emitted as a function of its own.  */
int __attribute__ ((noinline))
get_request_5 (int* q, int rw)
{
if (blk_oversized_queue_5 (q))
{
if ((rw == 1) || (rw == 0))
return -33;
return 0;
}
return -100;
}