re PR target/51244 ([SH] Inefficient conditional branch and code around T bit)

PR target/51244 * config/sh/sh_treg_combine.cc: New SH specific RTL pass. * config.gcc (SH extra_objs): Add sh_treg_combine.o. * config/sh/t-sh (sh_treg_combine.o): New entry. * config/sh/sh.c (sh_fixed_condition_code_regs): New function that implements the target hook TARGET_FIXED_CONDITION_CODE_REGS. (register_sh_passes): New function. Register sh_treg_combine pass. (sh_option_override): Invoke it. (sh_canonicalize_comparison): Handle op0_preserve_value. * config/sh/sh.md (*cbranch_t): Do not try to optimize missed test and branch opportunities. Canonicalize branch condition. (nott): Allow only if pseudos can be created for non-SH2A. PR target/51244 * gcc.dg/torture/pr51244-21.c: New. * gcc.target/sh/pr51244-20.c: New. * gcc.target/sh/pr51244-20-sh2a.c: New. From-SVN: r203492
2013-10-12 20:47:22 +00:00 · 2013-10-12 20:47:22 +00:00 · 5d30dc5b6d
parent 585a0b9916
commit 5d30dc5b6d
10 changed files with 1802 additions and 80 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,18 @@
+2013-10-12  Oleg Endo  <olegendo@gcc.gnu.org>
+
+	PR target/51244
+	* config/sh/sh_treg_combine.cc: New SH specific RTL pass.
+	* config.gcc (SH extra_objs): Add sh_treg_combine.o.
+	* config/sh/t-sh (sh_treg_combine.o): New entry.
+	* config/sh/sh.c (sh_fixed_condition_code_regs): New function that
+	implements the target hook TARGET_FIXED_CONDITION_CODE_REGS.
+	(register_sh_passes): New function.  Register sh_treg_combine pass.
+	(sh_option_override): Invoke it.
+	(sh_canonicalize_comparison): Handle op0_preserve_value.
+	* config/sh/sh.md (*cbranch_t): Do not try to optimize missed test and branch
+	opportunities.  Canonicalize branch condition.
+	(nott): Allow only if pseudos can be created for non-SH2A.
+
 2013-10-12  H.J. Lu  <hongjiu.lu@intel.com>

 	PR target/58690
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@ -465,6 +465,7 @@ sh[123456789lbe]*-*-* | sh-*-*)
 	cpu_type=sh
 	need_64bit_hwint=yes
 	extra_options="${extra_options} fused-madd.opt"
+	extra_objs="${extra_objs} sh_treg_combine.o"
 	;;
 v850*-*-*)
 	cpu_type=v850
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@ -53,6 +53,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "alloc-pool.h"
 #include "tm-constrs.h"
 #include "opts.h"
+#include "tree-pass.h"
+#include "pass_manager.h"
+#include "context.h"

 #include <sstream>
 #include <vector>
@ -311,6 +314,7 @@ static bool sequence_insn_p (rtx);
 static void sh_canonicalize_comparison (int *, rtx *, rtx *, bool);
 static void sh_canonicalize_comparison (enum rtx_code&, rtx&, rtx&,
 					enum machine_mode, bool);
+static bool sh_fixed_condition_code_regs (unsigned int* p1, unsigned int* p2);

 static void sh_init_sync_libfuncs (void) ATTRIBUTE_UNUSED;

@ -587,6 +591,9 @@ static const struct attribute_spec sh_attribute_table[] =
 #undef TARGET_CANONICALIZE_COMPARISON
 #define TARGET_CANONICALIZE_COMPARISON	sh_canonicalize_comparison

+#undef TARGET_FIXED_CONDITION_CODE_REGS
+#define TARGET_FIXED_CONDITION_CODE_REGS sh_fixed_condition_code_regs
+
 /* Machine-specific symbol_ref flags.  */
 #define SYMBOL_FLAG_FUNCVEC_FUNCTION	(SYMBOL_FLAG_MACH_DEP << 0)

@ -710,6 +717,34 @@ got_mode_name:;
 #undef err_ret
 }

+/* Register SH specific RTL passes.  */
+extern opt_pass* make_pass_sh_treg_combine (gcc::context* ctx, bool split_insns,
+				     const char* name);
+static void
+register_sh_passes (void)
+{
+  if (!TARGET_SH1)
+    return;
+
+/* Running the sh_treg_combine pass after ce1 generates better code when
+   comparisons are combined and reg-reg moves are introduced, because
+   reg-reg moves will be eliminated afterwards.  However, there are quite
+   a few cases where combine will be unable to fold comparison related
+   insns, so for now this registration is disabled:
+  register_pass (make_pass_sh_treg_combine (g, false, "sh_treg_combine1"),
+		 PASS_POS_INSERT_AFTER, "ce1", 1);
+*/
+
+  /* Run sh_treg_combine pass after combine but before register allocation.  */
+  register_pass (make_pass_sh_treg_combine (g, true, "sh_treg_combine2"),
+		 PASS_POS_INSERT_AFTER, "split1", 1);
+
+  /* Run sh_treg_combine pass after register allocation and basic block
+     reordering as this sometimes creates new opportunities.  */
+  register_pass (make_pass_sh_treg_combine (g, true, "sh_treg_combine3"),
+		 PASS_POS_INSERT_AFTER, "split4", 1);
+}
+
 /* Implement TARGET_OPTION_OVERRIDE macro.  Validate and override 
   various options, and do some machine dependent initialization.  */
 static void
@ -1022,6 +1057,8 @@ sh_option_override (void)
     target CPU.  */
  selected_atomic_model_
    = parse_validate_atomic_model_option (sh_atomic_model_str);
+
+  register_sh_passes ();
 }

 /* Print the operand address in x to the stream.  */
@ -1908,7 +1945,7 @@ prepare_move_operands (rtx operands[], enum machine_mode mode)
 static void
 sh_canonicalize_comparison (enum rtx_code& cmp, rtx& op0, rtx& op1,
 			    enum machine_mode mode,
-			    bool op0_preserve_value ATTRIBUTE_UNUSED)
+			    bool op0_preserve_value)
 {
  /* When invoked from within the combine pass the mode is not specified,
     so try to get it from one of the operands.  */
@ -1928,6 +1965,9 @@ sh_canonicalize_comparison (enum rtx_code& cmp, rtx& op0, rtx& op1,
  // Make sure that the constant operand is the second operand.
  if (CONST_INT_P (op0) && !CONST_INT_P (op1))
    {
+      if (op0_preserve_value)
+	return;
+
      std::swap (op0, op1);
      cmp = swap_condition (cmp);
    }
@ -2016,6 +2056,14 @@ sh_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
  *code = (int)tmp_code;
 }

+bool
+sh_fixed_condition_code_regs (unsigned int* p1, unsigned int* p2)
+{
+  *p1 = T_REG;
+  *p2 = INVALID_REGNUM;
+  return true;
+}
+
 enum rtx_code
 prepare_cbranch_operands (rtx *operands, enum machine_mode mode,
 			  enum rtx_code comparison)
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@ -8419,89 +8419,32 @@ label:
  return output_branch (sh_eval_treg_value (operands[1]), insn, operands);
 }
  "&& 1"
-  [(set (pc) (if_then_else (eq (reg:SI T_REG) (match_dup 2))
-			   (label_ref (match_dup 0))
-			   (pc)))]
+  [(const_int 0)]
 {
-  /* Try to find missed test and branch combine opportunities which result
-     in redundant T bit tests before conditional branches.
-     This is done not only after combine (and before reload) but in every
-     split pass, because some opportunities are formed also after combine.
-     FIXME: Probably this would not be needed if CCmode was used
-     together with TARGET_FIXED_CONDITION_CODE_REGS.  */
+  /* Try to canonicalize the branch condition if it is not one of:
+	(ne (reg:SI T_REG) (const_int 0))
+	(eq (reg:SI T_REG) (const_int 0))

-  const int treg_value = sh_eval_treg_value (operands[1]);
-  operands[2] = NULL_RTX;
+     Instead of splitting out a new insn, we modify the current insn's
+     operands as needed.  This preserves things such as REG_DEAD notes.  */

-  /* Scan the insns backwards for an insn that sets the T bit by testing a
-     reg against zero like:
-	(set (reg T_REG) (eq (reg) (const_int 0)))  */
-  rtx testing_insn = NULL_RTX;
-  rtx tested_reg = NULL_RTX;
+  if ((GET_CODE (operands[1]) == EQ || GET_CODE (operands[1]) == NE)
+      && REG_P (XEXP (operands[1], 0)) && REGNO (XEXP (operands[1], 0)) == T_REG
+      && XEXP (operands[1], 1) == const0_rtx)
+    DONE;

-  set_of_reg s0 = sh_find_set_of_reg (get_t_reg_rtx (), curr_insn,
-				      prev_nonnote_insn_bb);
-  if (s0.set_src != NULL_RTX
-      && GET_CODE (s0.set_src) == EQ
-      && REG_P (XEXP (s0.set_src, 0))
-      && satisfies_constraint_Z (XEXP (s0.set_src, 1)))
-    {
-      testing_insn = s0.insn;
-      tested_reg = XEXP (s0.set_src, 0);
-    }
-  else
-    FAIL;
+  int branch_cond = sh_eval_treg_value (operands[1]);
+  rtx new_cond_rtx = NULL_RTX;

-  /* Continue scanning the insns backwards and try to find the insn that
-     sets the tested reg which we found above.  If the reg is set by storing
-     the T bit or the negated T bit we can eliminate the test insn before
-     the branch.  Notice that the branch condition has to be inverted if the
-     test is eliminated.  */
+  if (branch_cond == 0)
+    new_cond_rtx = gen_rtx_EQ (VOIDmode, get_t_reg_rtx (), const0_rtx);
+  else if (branch_cond == 1)
+    new_cond_rtx = gen_rtx_NE (VOIDmode, get_t_reg_rtx (), const0_rtx);

-  /* If the T bit is used between the testing insn and the brach insn
-     leave it alone.  */
-  if (reg_used_between_p (get_t_reg_rtx (), testing_insn, curr_insn))
-    FAIL;
-
-  while (true)
-    {
-      /* It's not safe to go beyond the current basic block after reload.  */
-      set_of_reg s1 = sh_find_set_of_reg (tested_reg, s0.insn,
-					  reload_completed
-					  ? prev_nonnote_insn_bb
-					  : prev_nonnote_insn);
-      if (s1.set_src == NULL_RTX)
-	break;
-
-      if (t_reg_operand (s1.set_src, VOIDmode))
-	operands[2] = GEN_INT (treg_value ^ 1);
-      else if (negt_reg_operand (s1.set_src, VOIDmode))
-	operands[2] = GEN_INT (treg_value);
-      else if (REG_P (s1.set_src))
-	{
-	   /* If it's a reg-reg copy follow the copied reg.  This can
-	      happen e.g. when T bit store zero-extensions are
-	      eliminated.  */
-	  tested_reg = s1.set_src;
-	  s0.insn = s1.insn;
-	  continue;
-	}
-
-	/* It's only safe to remove the testing insn if the T bit is not
-	   modified between the testing insn and the insn that stores the
-	   T bit.  Notice that some T bit stores such as negc also modify
-	   the T bit.  */
-	if (modified_between_p (get_t_reg_rtx (), s1.insn, testing_insn)
-	    || modified_in_p (get_t_reg_rtx (), s1.insn))
-	  operands[2] = NULL_RTX;
-
-	break;
-    }
-
-  if (operands[2] == NULL_RTX)
-    FAIL;
-
-  set_insn_deleted (testing_insn);
+  if (new_cond_rtx != NULL_RTX)
+    validate_change (curr_insn, &XEXP (XEXP (PATTERN (curr_insn), 1), 0),
+		     new_cond_rtx, false);
+  DONE;
 }
  [(set_attr "type" "cbranch")])

@ -11480,10 +11423,13 @@ label:
 ;; multiple insns like:
 ;;	movt	Rn
 ;;	tst	Rn,Rn
+;; This requires an additional pseudo.  The SH specific sh_treg_combine RTL
+;; pass will look for this insn.  Disallow using it if pseudos can't be
+;; created.
 (define_insn_and_split "nott"
  [(set (reg:SI T_REG)
-	(xor:SI (match_operand:SI 0 "t_reg_operand" "") (const_int 1)))]
-  "TARGET_SH1"
+	(xor:SI (match_operand:SI 0 "t_reg_operand") (const_int 1)))]
+  "TARGET_SH2A || (TARGET_SH1 && can_create_pseudo_p ())"
 {
  gcc_assert (TARGET_SH2A);
  return "nott";
--- a/gcc/config/sh/sh_treg_combine.cc
+++ b/gcc/config/sh/sh_treg_combine.cc
--- a/gcc/config/sh/t-sh
+++ b/gcc/config/sh/t-sh
@ -21,6 +21,10 @@ sh-c.o: $(srcdir)/config/sh/sh-c.c \
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/sh/sh-c.c

+sh_treg_combine.o: $(srcdir)/config/sh/sh_treg_combine.cc \
+  $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_H) $(TM_P_H) coretypes.h
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
+
 DEFAULT_ENDIAN = $(word 1,$(TM_ENDIAN_CONFIG))
 OTHER_ENDIAN = $(word 2,$(TM_ENDIAN_CONFIG))

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,10 @@
+2013-10-12  Oleg Endo  <olegendo@gcc.gnu.org>
+
+	PR target/51244
+	* gcc.dg/torture/pr51244-21.c: New.
+	* gcc.target/sh/pr51244-20.c: New.
+	* gcc.target/sh/pr51244-20-sh2a.c: New.
+
 2013-10-12  Arnaud Charlet  <charlet@adacore.com>

 	* gnat.dg/specs/linker_section.ads: Update test.
--- a/gcc/testsuite/gcc.dg/torture/pr51244-21.c
+++ b/gcc/testsuite/gcc.dg/torture/pr51244-21.c
@ -0,0 +1,75 @@
+/* { dg-do run } */
+#include <assert.h>
+
+static inline int
+blk_oversized_queue (int* q)
+{
+  if (q[2])
+    return q[1] != 0;
+  return q[0] == 0;
+}
+
+int __attribute__ ((noinline))
+get_request (int* q, int rw)
+{
+  if (blk_oversized_queue (q))
+    {
+      if ((rw == 1) || (rw == 0))
+	return -33;
+
+      return 0;
+    }
+
+  return -100;
+}
+
+int main (void)
+{
+  int x[3]; 
+  int r;
+
+  x[0] = 0; x[1] = 1; x[2] = 1;
+  r = get_request (x, 0);
+  assert (r == -33);
+
+  r = get_request (x, 1);
+  assert (r == -33);
+
+  r = get_request (x, 2);
+  assert (r == 0);
+
+
+  x[0] = 0; x[1] = 0; x[2] = 1;
+  r = get_request (x, 0);
+  assert (r == -100);
+
+  r = get_request (x, 1);
+  assert (r == -100);
+
+  r = get_request (x, 2);
+  assert (r == -100);
+
+
+  x[0] = 0; x[1] = 0; x[2] = 0;
+  r = get_request (x, 0);
+  assert (r == -33);
+
+  r = get_request (x, 1);
+  assert (r == -33);
+
+  r = get_request (x, 2);
+  assert (r == 0);
+
+
+  x[0] = 0; x[1] = 0; x[2] = 0;
+  r = get_request (x, 0);
+  assert (r == -33);
+
+  r = get_request (x, 1);
+  assert (r == -33);
+
+  r = get_request (x, 2);
+  assert (r == 0);
+
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/sh/pr51244-20-sh2a.c
+++ b/gcc/testsuite/gcc.target/sh/pr51244-20-sh2a.c
@ -0,0 +1,14 @@
+/* Check that the SH specific sh_treg_combine RTL optimization pass works as
+   expected.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m2a*" } } */
+/* { dg-final { scan-assembler-times "tst" 5 } } */
+/* { dg-final { scan-assembler-times "movt" 0 } } */
+/* { dg-final { scan-assembler-times "nott" 1 } } */
+/* { dg-final { scan-assembler-times "cmp/eq" 2 } } */
+/* { dg-final { scan-assembler-times "cmp/hi" 4 } } */
+/* { dg-final { scan-assembler-times "cmp/gt" 3 } } */
+/* { dg-final { scan-assembler-times "not\t" 1 } } */
+
+#include "pr51244-20.c"
--- a/gcc/testsuite/gcc.target/sh/pr51244-20.c
+++ b/gcc/testsuite/gcc.target/sh/pr51244-20.c
@ -0,0 +1,103 @@
+/* Check that the SH specific sh_treg_combine RTL optimization pass works as
+   expected.  On SH2A the expected insns are slightly different, see
+   pr51244-20-sh2a.c.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" "-m2a*" } { "" } } */
+/* { dg-final { scan-assembler-times "tst" 6 } } */
+/* { dg-final { scan-assembler-times "movt" 1 } } */
+/* { dg-final { scan-assembler-times "cmp/eq" 2 } } */
+/* { dg-final { scan-assembler-times "cmp/hi" 4 } } */
+/* { dg-final { scan-assembler-times "cmp/gt" 2 } } */
+/* { dg-final { scan-assembler-times "not\t" 1 } } */
+
+
+/* non-SH2A: 2x tst, 1x movt, 2x cmp/eq, 1x cmp/hi
+   SH2A: 1x tst, 1x nott, 2x cmp/eq, 1x cmp/hi  */
+static inline int
+blk_oversized_queue_0 (int* q)
+{
+  if (q[2])
+    return q[1] == 5; 
+  return (q[0] != 5);
+}
+
+int __attribute__ ((noinline))
+get_request_0 (int* q, int rw)
+{
+  if (blk_oversized_queue_0 (q))
+    {
+      if ((rw == 1) || (rw == 0))
+	return -33;
+      return 0;
+    }
+  return -100;
+}
+
+
+/* 1x tst, 1x cmp/gt, 1x cmp/hi
+   On SH2A mem loads/stores have a wrong length of 4 bytes and thus will
+   not be placed in a delay slot.  This introduces an extra cmp/gt insn.  */
+static inline int
+blk_oversized_queue_1 (int* q)
+{
+  if (q[2])
+    return q[1] > 5; 
+  return (q[0] > 5);
+}
+
+int __attribute__ ((noinline))
+get_request_1 (int* q, int rw)
+{
+  if (blk_oversized_queue_1 (q))
+    {
+      if ((rw == 1) || (rw == 0))
+	return -33;
+      return 0;
+    }
+  return -100;
+}
+
+
+/* 1x tst, 1x cmp/gt, 1x cmp/hi  */
+static inline int
+blk_oversized_queue_2 (int* q)
+{
+  if (q[2])
+    return q[1] > 5; 
+  return (q[0] < 5);
+}
+
+int __attribute__ ((noinline))
+get_request_2 (int* q, int rw)
+{
+  if (blk_oversized_queue_2 (q))
+    {
+      if ((rw == 1) || (rw == 0))
+	return -33;
+      return 0;
+    }
+  return -100;
+}
+
+
+/* 2x tst, 1x cmp/hi, 1x not  */
+static inline int
+blk_oversized_queue_5 (int* q)
+{
+  if (q[2])
+    return q[1] != 0; 
+  return q[0] == 0;
+}
+
+int __attribute__ ((noinline))
+get_request_5 (int* q, int rw)
+{
+  if (blk_oversized_queue_5 (q))
+    {
+      if ((rw == 1) || (rw == 0))
+	return -33;
+      return 0;
+    }
+  return -100;
+}