diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2483bfbc865..5db233fcf59 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,36 @@ +Sat Nov 8 18:20:21 1997 J"orn Rennecke + + * sh.h (ENABLE_REGMOVE_PASS): Define. + + Bring over from FSF: + + Thu Oct 30 12:21:06 1997 J"orn Rennecke + + * va-sh.h (__va_arg_sh1): Define. + (va_arg): Use it. + SH3E doesn't use any integer registers for subsequent arguments + once a non-float value was passed in the stack. + * sh.c (machine_dependent_reorg): If optimizing, put explicit + alignment in front label for ADDR_DIFF_VEC. + * sh.h (PASS_IN_REG_P): Fix SH3E case. + (ADJUST_INSN_LENGTH): If not optimizing, add two extra bytes length. + + Tue Oct 28 15:06:44 1997 J"orn Rennecke + + * sh/elf.h (PREFERRED_DEBUGGING_TYPE): Undefine before including + svr4.h. + + Mon Oct 27 16:11:52 1997 J"orn Rennecke + + * sh.c (machine_dependent_reorg): When -flag_delayed_branches, + put an use_sfunc_addr before each sfunc. + * sh.md (use_sfunc_addr, dummy_jump): New insns. + (casesi): For TARGET_SH2, emit a dummy_jump after LAB. + + Tue Oct 21 07:12:28 1997 J"orn Rennecke + + * sh/elf.h (PREFERRED_DEBUGGING_TYPE): Don't redefine. + Fri Nov 7 10:22:24 1997 Jason Merrill * frame.c (add_fdes, count_fdes): Go back to checking pc_begin for diff --git a/gcc/config/sh/elf.h b/gcc/config/sh/elf.h index 2c3294b4935..a56077e544e 100644 --- a/gcc/config/sh/elf.h +++ b/gcc/config/sh/elf.h @@ -25,10 +25,6 @@ Boston, MA 02111-1307, USA. */ /* No SDB debugging info. */ #undef SDB_DEBUGGING_INFO -/* Prefer stabs. */ -#undef PREFERRED_DEBUGGING_TYPE -#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG - /* Undefine some macros defined in both sh.h and svr4.h. */ #undef IDENT_ASM_OP #undef ASM_FILE_END @@ -40,10 +36,30 @@ Boston, MA 02111-1307, USA. */ #undef ASM_OUTPUT_CONSTRUCTOR #undef ASM_OUTPUT_DESTRUCTOR #undef ASM_DECLARE_FUNCTION_NAME +#undef PREFERRED_DEBUGGING_TYPE /* Be ELF-like. */ #include "svr4.h" +/* The prefix to add to user-visible assembler symbols. + Note that svr4.h redefined it from the original value (that we want) + in sh.h */ + +#undef USER_LABEL_PREFIX +#define USER_LABEL_PREFIX "_" + +#undef LOCAL_LABEL_PREFIX +#define LOCAL_LABEL_PREFIX "." + +#undef ASM_FILE_START +#define ASM_FILE_START(FILE) do { \ + output_file_directive ((FILE), main_input_filename); \ + if (TARGET_LITTLE_ENDIAN) \ + fprintf ((FILE), "\t.little\n"); \ +} while (0) + + + /* Let code know that this is ELF. */ #define CPP_PREDEFINES "-D__sh__ -D__ELF__ -Acpu(sh) -Amachine(sh)" @@ -52,7 +68,7 @@ Boston, MA 02111-1307, USA. */ #define ASM_SPEC "%{ml:-little} %{mrelax:-relax}" #undef LINK_SPEC -#define LINK_SPEC "%{ml:-m shl} %{mrelax:-relax}" +#define LINK_SPEC "%{ml:-m shlelf} %{mrelax:-relax}" /* svr4.h undefined DBX_REGISTER_NUMBER, so we need to define it again. */ @@ -63,29 +79,26 @@ Boston, MA 02111-1307, USA. */ symbol names. */ #undef ASM_OUTPUT_LABELREF #define ASM_OUTPUT_LABELREF(STREAM,NAME) \ - fprintf (STREAM, "_%s", NAME) - -/* Because SH ELF uses underscores, we don't put a '.' before local - labels, for easy compatibility with the COFF implementation. 
*/ + asm_fprintf (STREAM, "%U%s", NAME) #undef ASM_GENERATE_INTERNAL_LABEL #define ASM_GENERATE_INTERNAL_LABEL(STRING, PREFIX, NUM) \ - sprintf (STRING, "*%s%d", PREFIX, NUM) + sprintf ((STRING), "*%s%s%d", LOCAL_LABEL_PREFIX, (PREFIX), (NUM)) #undef ASM_OUTPUT_INTERNAL_LABEL #define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, "%s%d:\n", PREFIX, NUM) + asm_fprintf ((FILE), "%L%s%d:\n", (PREFIX), (NUM)) #undef ASM_OUTPUT_SOURCE_LINE #define ASM_OUTPUT_SOURCE_LINE(file, line) \ do \ { \ static int sym_lineno = 1; \ - fprintf (file, ".stabn 68,0,%d,LM%d-", \ - line, sym_lineno); \ - assemble_name (file, \ + asm_fprintf ((file), ".stabn 68,0,%d,%LLM%d-", \ + (line), sym_lineno); \ + assemble_name ((file), \ XSTR (XEXP (DECL_RTL (current_function_decl), 0), 0));\ - fprintf (file, "\nLM%d:\n", sym_lineno); \ + asm_fprintf ((file), "\n%LLM%d:\n", sym_lineno); \ sym_lineno += 1; \ } \ while (0) @@ -94,7 +107,7 @@ while (0) #define DBX_OUTPUT_MAIN_SOURCE_FILE_END(FILE, FILENAME) \ do { \ text_section (); \ - fprintf (FILE, "\t.stabs \"\",%d,0,0,Letext\nLetext:\n", N_SO); \ + fprintf ((FILE), "\t.stabs \"\",%d,0,0,Letext\nLetext:\n", N_SO); \ } while (0) /* Arrange to call __main, rather than using crtbegin.o and crtend.o @@ -103,3 +116,7 @@ do { \ #undef FINI_SECTION_ASM_OP #undef STARTFILE_SPEC #undef ENDFILE_SPEC + +/* HANDLE_SYSV_PRAGMA (defined by svr4.h) takes precedence over HANDLE_PRAGMA. + We want to use the HANDLE_PRAGMA from sh.h. */ +#undef HANDLE_SYSV_PRAGMA diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c index 30406f66acb..f26b60042c8 100644 --- a/gcc/config/sh/sh.c +++ b/gcc/config/sh/sh.c @@ -35,6 +35,8 @@ Boston, MA 02111-1307, USA. */ #include "output.h" #include "insn-attr.h" +int code_for_indirect_jump_scratch = CODE_FOR_indirect_jump_scratch; + #define MSW (TARGET_LITTLE_ENDIAN ? 1 : 0) #define LSW (TARGET_LITTLE_ENDIAN ? 0 : 1) @@ -84,6 +86,10 @@ enum processor_type sh_cpu; rtx sh_compare_op0; rtx sh_compare_op1; +enum machine_mode sh_addr_diff_vec_mode; +rtx *uid_align; +int uid_align_max; + /* Provides the class number of the smallest class containing reg number. */ @@ -114,6 +120,8 @@ enum reg_class reg_class_from_letter[] = /* u */ NO_REGS, /* v */ NO_REGS, /* w */ FP0_REGS, /* x */ MAC_REGS, /* y */ FPUL_REGS, /* z */ R0_REGS }; + +static void split_branches PROTO ((rtx)); /* Print the operand address in x to the stream. */ @@ -170,6 +178,7 @@ print_operand_address (stream, x) according to modifier code. '.' print a .s if insn needs delay slot + ',' print LOCAL_LABEL_PREFIX '@' print trap, rte or rts depending upon pragma interruptness '#' output a nop if there is nothing to put in the delay slot 'O' print a constant without the # @@ -188,7 +197,10 @@ print_operand (stream, x, code) case '.': if (final_sequence && ! INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) - fprintf (stream, ".s"); + fprintf (stream, ASSEMBLER_DIALECT ? "/s" : ".s"); + break; + case ',': + fprintf (stream, "%s", LOCAL_LABEL_PREFIX); break; case '@': if (trap_exit) @@ -389,13 +401,12 @@ prepare_scc_operands (code) mode = GET_MODE (sh_compare_op1); sh_compare_op0 = force_reg (mode, sh_compare_op0); - if (code != EQ && code != NE - && (sh_compare_op1 != const0_rtx - || code == GTU || code == GEU || code == LTU || code == LEU)) + if ((code != EQ && code != NE + && (sh_compare_op1 != const0_rtx + || code == GTU || code == GEU || code == LTU || code == LEU)) + || TARGET_SH3E && GET_MODE_CLASS (mode) == MODE_FLOAT) sh_compare_op1 = force_reg (mode, sh_compare_op1); - /* ??? 
This should be `mode' not `SImode' in the compare, but that would - require fixing the branch patterns too. */ emit_insn (gen_rtx (SET, VOIDmode, t_reg, gen_rtx (code, SImode, sh_compare_op0, sh_compare_op1))); @@ -410,20 +421,31 @@ from_compare (operands, code) rtx *operands; int code; { - if (code != EQ && code != NE) + enum machine_mode mode = GET_MODE (sh_compare_op0); + rtx insn; + if (mode == VOIDmode) + mode = GET_MODE (sh_compare_op1); + if (code != EQ + || mode == DImode + || (TARGET_SH3E && GET_MODE_CLASS (mode) == MODE_FLOAT)) { - enum machine_mode mode = GET_MODE (sh_compare_op0); - if (mode == VOIDmode) - mode = GET_MODE (sh_compare_op1); - /* Force args into regs, since we can't use constants here. */ sh_compare_op0 = force_reg (mode, sh_compare_op0); if (sh_compare_op1 != const0_rtx - || code == GTU || code == GEU || code == LTU || code == LEU) + || code == GTU || code == GEU + || (TARGET_SH3E && GET_MODE_CLASS (mode) == MODE_FLOAT)) sh_compare_op1 = force_reg (mode, sh_compare_op1); } - operands[1] = sh_compare_op0; - operands[2] = sh_compare_op1; + if (TARGET_SH3E && GET_MODE_CLASS (mode) == MODE_FLOAT && code == GE) + { + from_compare (operands, GT); + insn = gen_ieee_ccmpeqsf_t (sh_compare_op0, sh_compare_op1); + } + else + insn = gen_rtx (SET, VOIDmode, + gen_rtx (REG, SImode, 18), + gen_rtx (code, SImode, sh_compare_op0, sh_compare_op1)); + emit_insn (insn); } /* Functions to output assembly code. */ @@ -520,33 +542,54 @@ print_slot (insn) INSN_DELETED_P (XVECEXP (insn, 0, 1)) = 1; } -/* We can't tell if we need a register as a scratch for the jump - until after branch shortening, and then it's too late to allocate a - register the 'proper' way. These instruction sequences are rare - anyway, so to avoid always using a reg up from our limited set, we'll - grab one when we need one on output. */ - -/* ??? Should fix compiler so that using a clobber scratch in jump - instructions works, and then this will be unnecessary. */ - char * output_far_jump (insn, op) rtx insn; rtx op; { - rtx thislab = gen_label_rtx (); + struct { rtx lab, reg, op; } this; + char *jump; + int far; - /* Output the delay slot insn first if any. */ - if (dbr_sequence_length ()) - print_slot (final_sequence); + this.lab = gen_label_rtx (); - output_asm_insn ("mov.l r13,@-r15", 0); - output_asm_insn ("mov.l %O0,r13", &thislab); - output_asm_insn ("jmp @r13", 0); - output_asm_insn ("mov.l @r15+,r13", 0); - output_asm_insn (".align 2", 0); - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L", CODE_LABEL_NUMBER (thislab)); - output_asm_insn (".long %O0", &op); + if (braf_branch_p (insn, 0)) + { + far = 0; + jump = "mov.w %O0,%1;braf %1"; + } + else + { + far = 1; + jump = "mov.l %O0,%1;jmp @%1"; + } + /* If we have a scratch register available, use it. */ + if (GET_CODE (PREV_INSN (insn)) == INSN + && INSN_CODE (PREV_INSN (insn)) == CODE_FOR_indirect_jump_scratch) + { + this.reg = SET_DEST (PATTERN (PREV_INSN (insn))); + output_asm_insn (jump, &this.lab); + if (dbr_sequence_length ()) + print_slot (final_sequence); + else + output_asm_insn ("nop", 0); + } + else + { + /* Output the delay slot insn first if any. */ + if (dbr_sequence_length ()) + print_slot (final_sequence); + + this.reg = gen_rtx (REG, SImode, 13); + output_asm_insn ("mov.l r13,@-r15", 0); + output_asm_insn (jump, &this.lab); + output_asm_insn ("mov.l @r15+,r13", 0); + } + if (far) + output_asm_insn (".align 2", 0); + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L", CODE_LABEL_NUMBER (this.lab)); + this.op = op; + output_asm_insn (far ? 
".long %O2" : ".word %O2-%O0", &this.lab); return ""; } @@ -563,97 +606,100 @@ output_branch (logic, insn, operands) rtx insn; rtx *operands; { - int label = lf++; - int length = get_attr_length (insn); - int adjusted_length; + int offset + = (insn_addresses[INSN_UID (XEXP (XEXP (SET_SRC (PATTERN (insn)), 1), 0))] + - insn_addresses[INSN_UID (insn)]); - /* Undo the effects of ADJUST_INSN_LENGTH, so that we get the real - length. If NEXT_INSN (PREV_INSN (insn)) != insn, then the insn - is inside a sequence, and ADJUST_INSN_LENGTH was not called on - it. */ - if (PREV_INSN (insn) == NULL - || NEXT_INSN (PREV_INSN (insn)) == insn) + if (offset == 260 + && final_sequence + && ! INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) { - adjusted_length = length; - ADJUST_INSN_LENGTH (insn, adjusted_length); - length -= (adjusted_length - length); + /* The filling of the delay slot has caused a forward branch to exceed + its range. + Just emit the insn from the delay slot in front of the branch. */ + /* The call to print_slot will clobber the operands. */ + rtx op0 = operands[0]; + print_slot (final_sequence); + operands[0] = op0; } - - switch (length) + else if (offset < -252 || offset > 258) { - case 2: - /* A branch with an unfilled delay slot. */ - case 4: - /* Simple branch in range -252..+258 bytes */ - return logic ? "bt%. %l0" : "bf%. %l0"; + /* This can happen when other condbranches hoist delay slot insn + from their destination, thus leading to code size increase. + But the branch will still be in the range -4092..+4098 bytes. */ - case 6: - /* A branch with an unfilled delay slot. */ - case 8: - /* Branch in range -4092..+4098 bytes. */ - { - /* The call to print_slot will clobber the operands. */ - rtx op0 = operands[0]; + int label = lf++; + /* The call to print_slot will clobber the operands. */ + rtx op0 = operands[0]; - /* If the instruction in the delay slot is annulled (true), then - there is no delay slot where we can put it now. The only safe - place for it is after the label. */ + /* If the instruction in the delay slot is annulled (true), then + there is no delay slot where we can put it now. The only safe + place for it is after the label. final will do that by default. */ - if (final_sequence) - { - fprintf (asm_out_file, "\tb%c%s\tLF%d\n", logic ? 'f' : 't', - INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0)) - ? "" : ".s", label); - if (! INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) - print_slot (final_sequence); - } - else - fprintf (asm_out_file, "\tb%c\tLF%d\n", logic ? 'f' : 't', label); - - output_asm_insn ("bra %l0", &op0); - fprintf (asm_out_file, "\tnop\n"); - fprintf (asm_out_file, "LF%d:\n", label); - - if (final_sequence - && INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) + if (final_sequence + && ! INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) + { + asm_fprintf (asm_out_file, "\tb%s%ss\t%LLF%d\n", logic ? "f" : "t", + ASSEMBLER_DIALECT ? "/" : ".", label); print_slot (final_sequence); - } - return ""; + } + else + asm_fprintf (asm_out_file, "\tb%s\t%LLF%d\n", logic ? "f" : "t", label); - case 16: - /* A branch with an unfilled delay slot. */ - case 18: - /* Branches a long way away. */ - { - /* The call to print_slot will clobber the operands. */ - rtx op0 = operands[0]; + output_asm_insn ("bra\t%l0", &op0); + fprintf (asm_out_file, "\tnop\n"); + ASM_OUTPUT_INTERNAL_LABEL(asm_out_file, "LF", label); - /* If the instruction in the delay slot is annulled (true), then - there is no delay slot where we can put it now. 
The only safe - place for it is after the label. */ - - if (final_sequence) - { - fprintf (asm_out_file, "\tb%c%s\tLF%d\n", logic ? 'f' : 't', - INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0)) - ? "" : ".s", label); - if (! INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) - print_slot (final_sequence); - } - else - fprintf (asm_out_file, "\tb%c\tLF%d\n", logic ? 'f' : 't', label); - - output_far_jump (insn, op0); - fprintf (asm_out_file, "LF%d:\n", label); - - if (final_sequence - && INSN_ANNULLED_BRANCH_P (XVECEXP (final_sequence, 0, 0))) - print_slot (final_sequence); - } return ""; } + return logic ? "bt%.\t%l0" : "bf%.\t%l0"; +} - abort (); +int branch_offset (); + +char * +output_branchy_insn (code, template, insn, operands) + char *template; + enum rtx_code code; + rtx insn; + rtx *operands; +{ + rtx next_insn = NEXT_INSN (insn); + int label_nr; + + if (next_insn && GET_CODE (next_insn) == JUMP_INSN && condjump_p (next_insn)) + { + rtx src = SET_SRC (PATTERN (next_insn)); + if (GET_CODE (src) == IF_THEN_ELSE && GET_CODE (XEXP (src, 0)) != code) + { + /* Following branch not taken */ + operands[9] = gen_label_rtx (); + emit_label_after (operands[9], next_insn); + return template; + } + else + { + int offset = branch_offset (next_insn) + 4; + if (offset >= -252 && offset <= 256) + { + if (GET_CODE (src) == IF_THEN_ELSE) + /* branch_true */ + src = XEXP (src, 1); + operands[9] = src; + return template; + } + } + } + operands[9] = gen_label_rtx (); + emit_label_after (operands[9], insn); + return template; +} + +char * +output_ieee_ccmpeq (insn, operands) + rtx insn, operands; +{ + output_branchy_insn (NE, "bt\t%l9\\;fcmp/eq\t%1,%0", insn, operands); } /* Output to FILE the start of the assembler file. */ @@ -751,20 +797,15 @@ shiftcosts (x) /* If shift by a non constant, then this will be expensive. */ if (GET_CODE (XEXP (x, 1)) != CONST_INT) - { - if (TARGET_SH3) - return 2; - /* If not an sh3 then we don't even have an instruction for it. */ - return 20; - } + return SH_DYNAMIC_SHIFT_COST; /* Otherwise, return the true cost in instructions. */ if (GET_CODE (x) == ASHIFTRT) { int cost = ashiftrt_insns[value]; /* If SH3, then we put the constant in a reg and use shad. */ - if (TARGET_SH3 && cost > 3) - cost = 3; + if (cost > 1 + SH_DYNAMIC_SHIFT_COST) + cost = 1 + SH_DYNAMIC_SHIFT_COST; return cost; } else @@ -999,9 +1040,11 @@ expand_ashiftrt (operands) emit_insn (gen_ashrsi3_d (operands[0], operands[1], count)); return 1; } - else if (ashiftrt_insns[INTVAL (operands[2])] > 3) + else if (ashiftrt_insns[INTVAL (operands[2]) & 31] + > 1 + SH_DYNAMIC_SHIFT_COST) { - rtx count = force_reg (SImode, GEN_INT (- INTVAL (operands[2]))); + rtx count + = force_reg (SImode, GEN_INT (- (INTVAL (operands[2]) & 31))); emit_insn (gen_ashrsi3_d (operands[0], operands[1], count)); return 1; } @@ -1009,7 +1052,7 @@ expand_ashiftrt (operands) if (GET_CODE (operands[2]) != CONST_INT) return 0; - value = INTVAL (operands[2]); + value = INTVAL (operands[2]) & 31; if (value == 31) { @@ -1050,6 +1093,12 @@ expand_ashiftrt (operands) return 1; } +int sh_dynamicalize_shift_p (count) + rtx count; +{ + return shift_insns[INTVAL (count)] > 1 + SH_DYNAMIC_SHIFT_COST; +} + /* Try to find a good way to implement the combiner pattern [(set (match_operand:SI 0 "register_operand" "r") (and:SI (ashift:SI (match_operand:SI 1 "register_operand" "r") @@ -1434,7 +1483,7 @@ shl_sext_kind (left_rtx, size_rtx, costp) if (TARGET_SH3) { /* Try to use a dynamic shift. 
*/ - cost = shift_insns[32 - insize] + 3; + cost = shift_insns[32 - insize] + 1 + SH_DYNAMIC_SHIFT_COST; if (cost < best_cost) { kind = 0; @@ -1645,6 +1694,8 @@ typedef struct static pool_node pool_vector[MAX_POOL_SIZE]; static int pool_size; +static int max_uid_before_fixup_addr_diff_vecs; + /* ??? If we need a constant in HImode which is the truncated value of a constant we need in SImode, we could combine the two entries thus saving two bytes. Is this common enough to be worth the effort of implementing @@ -1735,7 +1786,8 @@ dump_table (scan) scan = emit_label_after (gen_label_rtx (), scan); scan = emit_insn_after (gen_align_4 (), scan); } - scan = emit_label_after (p->label, scan); + if (p->label) + scan = emit_label_after (p->label, scan); scan = emit_insn_after (gen_consttable_4 (p->value), scan); break; case DFmode: @@ -1746,7 +1798,8 @@ dump_table (scan) scan = emit_label_after (gen_label_rtx (), scan); scan = emit_insn_after (gen_align_4 (), scan); } - scan = emit_label_after (p->label, scan); + if (p->label) + scan = emit_label_after (p->label, scan); scan = emit_insn_after (gen_consttable_8 (p->value), scan); break; default: @@ -1792,7 +1845,8 @@ broken_move (insn) order bits end up as. */ && GET_MODE (SET_DEST (pat)) != QImode && CONSTANT_P (SET_SRC (pat)) - && ! (GET_CODE (SET_SRC (pat)) == CONST_DOUBLE + && ! (TARGET_SH3E + && GET_CODE (SET_SRC (pat)) == CONST_DOUBLE && (fp_zero_operand (SET_SRC (pat)) || fp_one_operand (SET_SRC (pat))) && GET_CODE (SET_DEST (pat)) == REG @@ -1806,28 +1860,49 @@ broken_move (insn) return 0; } +int +cache_align_p (insn) + rtx insn; +{ + rtx pat; + + if (! insn) + return 1; + + if (GET_CODE (insn) != INSN) + return 0; + + pat = PATTERN (insn); + return (GET_CODE (pat) == UNSPEC_VOLATILE + && XINT (pat, 1) == 1 + && INTVAL (XVECEXP (pat, 0, 0)) == CACHE_LOG); +} + +static int +mova_p (insn) + rtx insn; +{ + return (GET_CODE (insn) == INSN + && GET_CODE (PATTERN (insn)) == SET + && GET_CODE (SET_SRC (PATTERN (insn))) == UNSPEC + && XINT (SET_SRC (PATTERN (insn)), 1) == 1); +} + /* Find the last barrier from insn FROM which is close enough to hold the constant pool. If we can't find one, then create one near the end of the range. */ -/* ??? It would be good to put constant pool tables between a case jump and - the jump table. This fails for two reasons. First, there is no - barrier after the case jump. This is a bug in the casesi pattern. - Second, inserting the table here may break the mova instruction that - loads the jump table address, by moving the jump table too far away. - We fix that problem by never outputting the constant pool between a mova - and its label. */ - static rtx -find_barrier (from) - rtx from; +find_barrier (num_mova, mova, from) + int num_mova; + rtx mova, from; { int count_si = 0; int count_hi = 0; int found_hi = 0; int found_si = 0; - rtx found_barrier = 0; - rtx found_mova = 0; + int leading_mova = num_mova; + rtx barrier_before_mova, found_barrier = 0, good_barrier = 0; int si_limit; int hi_limit; @@ -1843,83 +1918,100 @@ find_barrier (from) before the table, subtract 2 for the instruction that fills the jump delay slot. This gives 1018. */ - /* If not optimizing, then it is possible that the jump instruction we add - won't be shortened, and thus will have a length of 14 instead of 2. - We must adjust the limits downwards to account for this, giving a limit - of 1008 for SImode and 500 for HImode. 
*/ + /* The branch will always be shortened now that the reference address for + forward branches is the sucessor address, thus we need no longer make + adjustments to the [sh]i_limit for -O0. */ - if (optimize) - { - si_limit = 1018; - hi_limit = 510; - } - else - { - si_limit = 1008; - hi_limit = 500; - } - - /* If not optimizing for space, then the constant pool will be - aligned to a 4 to 16 byte boundary. We must make room for that - alignment that by reducing the limits. - ??? It would be better to not align the constant pool, but - ASM_OUTPUT_ALIGN_CODE does not make any provision for basing the - alignment on the instruction. */ - - if (! TARGET_SMALLCODE) - { - if (TARGET_SH3 || TARGET_SH3E) - { - si_limit -= 14; - hi_limit -= 14; - } - else - { - si_limit -= 2; - hi_limit -= 2; - } - } + si_limit = 1018; + hi_limit = 510; while (from && count_si < si_limit && count_hi < hi_limit) { - int inc = get_attr_length (from); + int inc = 0; + + /* The instructions created by fixup_addr_diff_vecs have no valid length + info yet. They should be considered to have zero at this point. */ + if (INSN_UID (from) < max_uid_before_fixup_addr_diff_vecs) + inc = get_attr_length (from); if (GET_CODE (from) == BARRIER) - found_barrier = from; + { + found_barrier = from; + /* If we are at the end of the function, or in front of an alignemnt + instruction, we need not insert an extra alignment. We prefer + this kind of barrier. */ + + if (cache_align_p (next_real_insn (found_barrier))) + good_barrier = from; + } if (broken_move (from)) { - rtx pat = PATTERN (from); - rtx src = SET_SRC (pat); - rtx dst = SET_DEST (pat); - enum machine_mode mode = GET_MODE (dst); + rtx pat, src, dst; + enum machine_mode mode; + + pat = PATTERN (from); + if (GET_CODE (pat) == PARALLEL) + pat = XVECEXP (pat, 0, 0); + src = SET_SRC (pat); + dst = SET_DEST (pat); + mode = GET_MODE (dst); /* We must explicitly check the mode, because sometimes the front end will generate code to load unsigned constants into HImode targets without properly sign extending them. */ if (mode == HImode || (mode == SImode && hi_const (src))) { - found_hi = 1; + found_hi += 2; /* We put the short constants before the long constants, so we must count the length of short constants in the range for the long constants. */ /* ??? This isn't optimal, but is easy to do. */ - if (found_si) - count_si += 2; + si_limit -= 2; } else - found_si = 1; + { + if (found_si > count_si) + count_si = found_si; + found_si += GET_MODE_SIZE (mode); + if (num_mova) + si_limit -= GET_MODE_SIZE (mode); + } } if (GET_CODE (from) == INSN && GET_CODE (PATTERN (from)) == SET && GET_CODE (SET_SRC (PATTERN (from))) == UNSPEC && XINT (SET_SRC (PATTERN (from)), 1) == 1) - found_mova = from; + { + if (! num_mova++) + { + leading_mova = 0; + mova = from; + barrier_before_mova = good_barrier ? good_barrier : found_barrier; + } + if (found_si > count_si) + count_si = found_si; + } else if (GET_CODE (from) == JUMP_INSN && (GET_CODE (PATTERN (from)) == ADDR_VEC || GET_CODE (PATTERN (from)) == ADDR_DIFF_VEC)) - found_mova = 0; + { + if (num_mova) + num_mova--; + if (cache_align_p (NEXT_INSN (next_nonnote_insn (from)))) + { + /* We have just passed the barrier in front front of the + ADDR_DIFF_VEC. Since the ADDR_DIFF_VEC is accessed + as data, just like our pool constants, this is a good + opportunity to accomodate what we have gathered so far. + If we waited any longer, we could end up at a barrier in + front of code, which gives worse cache usage for separated + instruction / data caches. 
*/ + good_barrier = found_barrier; + break; + } + } if (found_si) count_si += inc; @@ -1928,12 +2020,42 @@ find_barrier (from) from = NEXT_INSN (from); } - /* Insert the constant pool table before the mova instruction, to prevent - the mova label reference from going out of range. */ - if (found_mova) - from = found_mova; + if (num_mova) + if (leading_mova) + { + /* Try as we might, the leading mova is out of range. Change + it into a load (which will become a pcload) and retry. */ + SET_SRC (PATTERN (mova)) = XVECEXP (SET_SRC (PATTERN (mova)), 0, 0); + INSN_CODE (mova) = -1; + return find_barrier (0, 0, mova); + } + else + { + /* Insert the constant pool table before the mova instruction, + to prevent the mova label reference from going out of range. */ + from = mova; + good_barrier = found_barrier = barrier_before_mova; + } - if (! found_barrier) + if (found_barrier) + { + /* We have before prepared barriers to come in pairs, with an + alignment instruction in-between. We want to use the first + barrier, so that the alignment applies to the code. + If we are compiling for SH3 or newer, there are some exceptions + when the second barrier and the alignment doesn't exist yet, so + we have to add it. */ + if (good_barrier) + found_barrier = good_barrier; + else if (! TARGET_SMALLCODE) + { + found_barrier + = emit_insn_before (gen_align_log (GEN_INT (CACHE_LOG)), + found_barrier); + found_barrier = emit_barrier_before (found_barrier); + } + } + else { /* We didn't find a barrier in time to dump our stuff, so we'll make one. */ @@ -1961,6 +2083,11 @@ find_barrier (from) LABEL_NUSES (label) = 1; found_barrier = emit_barrier_after (from); emit_label_after (label, found_barrier); + if (! TARGET_SMALLCODE) + { + emit_barrier_after (found_barrier); + emit_insn_after (gen_align_log (GEN_INT (CACHE_LOG)), found_barrier); + } } return found_barrier; @@ -1970,7 +2097,7 @@ find_barrier (from) positively find the register that is used to call the sfunc, and this register is not used anywhere else in this instruction - except as the destination of a set, return this register; else, return 0. */ -static rtx +rtx sfunc_uses_reg (insn) rtx insn; { @@ -1986,7 +2113,7 @@ sfunc_uses_reg (insn) for (reg_part = 0, i = XVECLEN (pattern, 0) - 1; i >= 1; i--) { part = XVECEXP (pattern, 0, i); - if (GET_CODE (part) == USE) + if (GET_CODE (part) == USE && GET_MODE (XEXP (part, 0)) == SImode) reg_part = part; } if (! reg_part) @@ -2092,6 +2219,480 @@ noncall_uses_reg (reg, insn, set) return 0; } +/* Given a X, a pattern of an insn or a part of it, return a mask of used + general registers. Bits 0..15 mean that the respective registers + are used as inputs in the instruction. Bits 16..31 mean that the + registers 0..15, respectively, are used as outputs, or are clobbered. + IS_DEST should be set to 16 if X is the destination of a SET, else to 0. */ +int +regs_used (x, is_dest) + rtx x; int is_dest; +{ + enum rtx_code code; + char *fmt; + int i, used = 0; + + if (! 
x) + return used; + code = GET_CODE (x); + switch (code) + { + case REG: + if (REGNO (x) < 16) + return (((1 << HARD_REGNO_NREGS (0, GET_MODE (x))) - 1) + << (REGNO (x) + is_dest)); + return 0; + case SUBREG: + { + rtx y = SUBREG_REG (x); + + if (GET_CODE (y) != REG) + break; + if (REGNO (y) < 16) + return (((1 << HARD_REGNO_NREGS (0, GET_MODE (x))) - 1) + << (REGNO (y) + SUBREG_WORD (x) + is_dest)); + return 0; + } + case SET: + return regs_used (SET_SRC (x), 0) | regs_used (SET_DEST (x), 16); + case RETURN: + /* If there was a return value, it must have been indicated with USE. */ + return 0x00ffff00; + case CLOBBER: + is_dest = 1; + break; + case MEM: + is_dest = 0; + break; + case CALL: + used |= 0x00ff00f0; + break; + } + + fmt = GET_RTX_FORMAT (code); + + for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) + { + if (fmt[i] == 'E') + { + register int j; + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + used |= regs_used (XVECEXP (x, i, j), is_dest); + } + else if (fmt[i] == 'e') + used |= regs_used (XEXP (x, i), is_dest); + } + return used; +} + +/* Create an instruction that prevents redirection of a conditional branch + to the desitination of the JUMP with address ADDR. + If the branch needs to be implemented as an indirect jump, try to find + a scratch register for it. + If NEED_BLOCK is 0, don't do anything unless we need a scratch register. + If any preceding insn that doesn't fit into a delay slot is good enough, + pass 1. Pass 2 if a definite blocking insn is needed. + -1 is used internally to avoid deep recursion. + If a blocking instruction is made or recognized, return it. */ + +static rtx +gen_block_redirect (jump, addr, need_block) + rtx jump; + int addr, need_block; +{ + int dead = 0; + rtx prev = prev_nonnote_insn (jump); + rtx dest; + + /* First, check if we already have an instruction that satisfies our need. */ + if (prev && GET_CODE (prev) == INSN && ! INSN_DELETED_P (prev)) + { + if (INSN_CODE (prev) == CODE_FOR_indirect_jump_scratch) + return prev; + if (GET_CODE (PATTERN (prev)) == USE + || GET_CODE (PATTERN (prev)) == CLOBBER + || get_attr_in_delay_slot (prev) == IN_DELAY_SLOT_YES) + prev = jump; + else if ((need_block &= ~1) < 0) + return prev; + else if (recog_memoized (prev) == CODE_FOR_block_branch_redirect) + need_block = 0; + } + /* We can't use JUMP_LABEL here because it might be undefined + when not optimizing. */ + dest = XEXP (SET_SRC (PATTERN (jump)), 0); + /* If the branch is out of range, try to find a scratch register for it. */ + if (optimize + && (insn_addresses[INSN_UID (dest)] - addr + 4092U > 4092 + 4098)) + { + rtx scan; + /* Don't look for the stack pointer as a scratch register, + it would cause trouble if an interrupt occured. */ + unsigned try = 0x7fff, used; + int jump_left = flag_expensive_optimizations + 1; + + /* It is likely that the most recent eligible instruction is wanted for + the delay slot. Therefore, find out which registers it uses, and + try to avoid using them. 
*/ + + for (scan = jump; scan = PREV_INSN (scan); ) + { + enum rtx_code code; + + if (INSN_DELETED_P (scan)) + continue; + code = GET_CODE (scan); + if (code == CODE_LABEL || code == JUMP_INSN) + break; + if (code == INSN + && GET_CODE (PATTERN (scan)) != USE + && GET_CODE (PATTERN (scan)) != CLOBBER + && get_attr_in_delay_slot (scan) == IN_DELAY_SLOT_YES) + { + try &= ~regs_used (PATTERN (scan), 0); + break; + } + } + for (used = dead = 0, scan = JUMP_LABEL (jump); scan = NEXT_INSN (scan); ) + { + enum rtx_code code; + + if (INSN_DELETED_P (scan)) + continue; + code = GET_CODE (scan); + if (GET_RTX_CLASS (code) == 'i') + { + used |= regs_used (PATTERN (scan), 0); + if (code == CALL_INSN) + used |= regs_used (CALL_INSN_FUNCTION_USAGE (scan), 0); + dead |= (used >> 16) & ~used; + if (dead & try) + { + dead &= try; + break; + } + if (code == JUMP_INSN) + if (jump_left-- && simplejump_p (scan)) + scan = JUMP_LABEL (scan); + else + break; + } + } + /* Mask out the stack pointer again, in case it was + the only 'free' register we have found. */ + dead &= 0x7fff; + } + /* If the immediate destination is still in range, check for possible + threading with a jump beyond the delay slot insn. + Don't check if we are called recursively; the jump has been or will be + checked in a different invokation then. */ + + else if (optimize && need_block >= 0) + { + rtx next = next_active_insn (next_active_insn (dest)); + if (next && GET_CODE (next) == JUMP_INSN + && GET_CODE (PATTERN (next)) == SET + && recog_memoized (next) == CODE_FOR_jump) + { + dest = JUMP_LABEL (next); + if (dest + && insn_addresses[INSN_UID (dest)] - addr + 4092U > 4092 + 4098) + gen_block_redirect (next, insn_addresses[INSN_UID (next)], -1); + } + } + + if (dead) + { + rtx reg = gen_rtx (REG, SImode, exact_log2 (dead & -dead)); + + /* It would be nice if we could convert the jump into an indirect + jump / far branch right now, and thus exposing all consitituent + instructions to further optimization. However, reorg uses + simplejump_p to determine if there is an unconditional jump where + it should try to schedule instructions from the target of the + branch; simplejump_p fails for indirect jumps even if they have + a JUMP_LABEL. */ + rtx insn = emit_insn_before (gen_indirect_jump_scratch + (reg, GEN_INT (INSN_UID (JUMP_LABEL (jump)))) + , jump); + INSN_CODE (insn) = CODE_FOR_indirect_jump_scratch; + return insn; + } + else if (need_block) + /* We can't use JUMP_LABEL here because it might be undefined + when not optimizing. */ + return emit_insn_before (gen_block_branch_redirect + (GEN_INT (INSN_UID (XEXP (SET_SRC (PATTERN (jump)), 0)))) + , jump); + return prev; +} + +#define CONDJUMP_MIN -252 +#define CONDJUMP_MAX 262 +struct far_branch +{ + /* A label (to be placed) in front of the jump + that jumps to our ultimate destination. */ + rtx near_label; + /* Where we are going to insert it if we cannot move the jump any farther, + or the jump itself if we have picked up an existing jump. */ + rtx insert_place; + /* The ultimate destination. */ + rtx far_label; + struct far_branch *prev; + /* If the branch has already been created, its address; + else the address of its first prospective user. 
*/ + int address; +}; + +enum mdep_reorg_phase_e mdep_reorg_phase; +void +gen_far_branch (bp) + struct far_branch *bp; +{ + rtx insn = bp->insert_place; + rtx jump; + rtx label = gen_label_rtx (); + + emit_label_after (label, insn); + if (bp->far_label) + { + jump = emit_jump_insn_after (gen_jump (bp->far_label), insn); + LABEL_NUSES (bp->far_label)++; + } + else + jump = emit_jump_insn_after (gen_return (), insn); + emit_label_after (bp->near_label, insn); + JUMP_LABEL (jump) = bp->far_label; + if (! invert_jump (insn, label)) + abort (); + /* Prevent reorg from undoing our splits. */ + gen_block_redirect (jump, bp->address += 2, 2); +} + +static void +fixup_aligns () +{ + rtx insn = get_last_insn (); + rtx align_tab[MAX_BITS_PER_WORD]; + int i; + + for (i = CACHE_LOG; i >= 0; i--) + align_tab[i] = insn; + bzero ((char *) uid_align, uid_align_max * sizeof *uid_align); + for (; insn; insn = PREV_INSN (insn)) + { + int uid = INSN_UID (insn); + if (uid < uid_align_max) + uid_align[uid] = align_tab[1]; + if (GET_CODE (insn) == INSN) + { + rtx pat = PATTERN (insn); + if (GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == 1) + { + /* Found an alignment instruction. */ + int log = INTVAL (XVECEXP (pat, 0, 0)); + uid_align[uid] = align_tab[log]; + for (i = log - 1; i >= 0; i--) + align_tab[i] = insn; + } + } + else if (GET_CODE (insn) == JUMP_INSN + && GET_CODE (PATTERN (insn)) == SET) + { + rtx dest = SET_SRC (PATTERN (insn)); + if (GET_CODE (dest) == IF_THEN_ELSE) + dest = XEXP (dest, 1); + if (GET_CODE (dest) == LABEL_REF) + { + dest = XEXP (dest, 0); + if (! uid_align[INSN_UID (dest)]) + /* Mark backward branch. */ + uid_align[uid] = 0; + } + } + } +} + +/* Fix up ADDR_DIFF_VECs. */ +void +fixup_addr_diff_vecs (first) + rtx first; +{ + rtx insn; + int max_address; + int need_fixup_aligns = 0; + + if (optimize) + max_address = insn_addresses[INSN_UID (get_last_insn ())] + 2; + for (insn = first; insn; insn = NEXT_INSN (insn)) + { + rtx vec_lab, rel_lab, pat, min_lab, max_lab, adj; + int len, i, min, max, size; + + if (GET_CODE (insn) != JUMP_INSN + || GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC) + continue; + pat = PATTERN (insn); + rel_lab = vec_lab = XEXP (XEXP (pat, 0), 0); + if (TARGET_SH2) + { + rtx prev, prevpat, x; + + /* Search the matching casesi_jump_2. */ + for (prev = vec_lab; ; prev = PREV_INSN (prev)) + { + if (GET_CODE (prev) != JUMP_INSN) + continue; + prevpat = PATTERN (prev); + if (GET_CODE (prevpat) != PARALLEL || XVECLEN (prevpat, 0) != 2) + continue; + x = XVECEXP (prevpat, 0, 1); + if (GET_CODE (x) != USE) + continue; + x = XEXP (x, 0); + if (GET_CODE (x) == LABEL_REF && XEXP (x, 0) == vec_lab) + break; + } + /* Fix up the ADDR_DIF_VEC to be relative + to the reference address of the braf. */ + XEXP (XEXP (pat, 0), 0) + = rel_lab = XEXP (XEXP (SET_SRC (XVECEXP (prevpat, 0, 0)), 1), 0); + } + if (! optimize) + continue; + len = XVECLEN (pat, 1); + if (len <= 0) + abort (); + for (min = max_address, max = 0, i = len - 1; i >= 0; i--) + { + rtx lab = XEXP (XVECEXP (pat, 1, i), 0); + int addr = insn_addresses[INSN_UID (lab)]; + if (addr < min) + { + min = addr; + min_lab = lab; + } + if (addr > max) + { + max = addr; + max_lab = lab; + } + } + adj + = emit_insn_before (gen_addr_diff_vec_adjust (min_lab, max_lab, rel_lab, + GEN_INT (len)), vec_lab); + size = (XVECLEN (pat, 1) * GET_MODE_SIZE (GET_MODE (pat)) + - addr_diff_vec_adjust (adj, 0)); + /* If this is a very small table, we want to remove the alignment after + the table. */ + if (! 
TARGET_SMALLCODE && size <= 1 << (CACHE_LOG - 2)) + { + rtx align = NEXT_INSN (next_nonnote_insn (insn)); + PUT_CODE (align, NOTE); + NOTE_LINE_NUMBER (align) = NOTE_INSN_DELETED; + NOTE_SOURCE_FILE (align) = 0; + need_fixup_aligns = 1; + } + } + if (need_fixup_aligns) + fixup_aligns (); +} + +/* Say how much the ADDR_DIFF_VEC following INSN can be shortened. + If FIRST_PASS is nonzero, all addresses and length of following + insns are still uninitialized. */ +int +addr_diff_vec_adjust (insn, first_pass) + rtx insn; + int first_pass; +{ + rtx pat = PATTERN (insn); + rtx min_lab = XEXP (XVECEXP (pat, 0, 0), 0); + rtx max_lab = XEXP (XVECEXP (pat, 0, 1), 0); + rtx rel_lab = XEXP (XVECEXP (pat, 0, 2), 0); + int len = INTVAL (XVECEXP (pat, 0, 3)); + int addr, min_addr, max_addr, saving, prev_saving = 0, offset; + rtx align_insn = uid_align[INSN_UID (rel_lab)]; + int standard_size = TARGET_BIGTABLE ? 4 : 2; + int last_size = GET_MODE_SIZE ( GET_MODE(pat)); + int align_fuzz = 0; + + if (! insn_addresses) + return 0; + if (first_pass) + /* If optimizing, we may start off with an optimistic guess. */ + return optimize ? len & ~1 : 0; + addr = insn_addresses[INSN_UID (rel_lab)]; + min_addr = insn_addresses[INSN_UID (min_lab)]; + max_addr = insn_addresses[INSN_UID (max_lab)]; + if (! last_size) + last_size = standard_size; + if (TARGET_SH2) + prev_saving = ((standard_size - last_size) * len) & ~1; + + /* The savings are linear to the vector length. However, if we have an + odd saving, we need one byte again to reinstate 16 bit alignment. */ + saving = ((standard_size - 1) * len) & ~1; + offset = prev_saving - saving; + + if ((insn_addresses[INSN_UID (align_insn)] < max_addr + || (insn_addresses[INSN_UID (align_insn)] == max_addr + && next_real_insn (max_lab) != align_insn)) + && GET_CODE (align_insn) == INSN) + { + int align = 1 << INTVAL (XVECEXP (PATTERN (align_insn), 0, 0)); + int align_addr = insn_addresses[INSN_UID (align_insn)]; + if (align_addr > insn_addresses[INSN_UID (insn)]) + { + int old_offset = offset; + offset = (align_addr - 1 & align - 1) + offset & -align; + align_addr += old_offset; + } + align_fuzz += (align_addr - 1) & (align - 2); + align_insn = uid_align[INSN_UID (align_insn)]; + if (insn_addresses[INSN_UID (align_insn)] <= max_addr + && GET_CODE (align_insn) == INSN) + { + int align2 = 1 << INTVAL (XVECEXP (PATTERN (align_insn), 0, 0)); + align_addr = insn_addresses[INSN_UID (align_insn)]; + if (align_addr > insn_addresses[INSN_UID (insn)]) + { + int old_offset = offset; + offset = (align_addr - 1 & align2 - 1) + offset & -align2; + align_addr += old_offset; + } + align_fuzz += (align_addr - 1) & (align2 - align); + } + } + + if (min_addr >= addr + && max_addr + offset - addr + align_fuzz <= 255) + { + PUT_MODE (pat, QImode); + return saving; + } + saving = 2 * len; +/* Since alignment might play a role in min_addr if it is smaller than addr, + we may not use it without exact alignment compensation; a 'worst case' + estimate is not good enough, because it won't prevent infinite oscillation + of shorten_branches. + ??? We should fix that eventually, but the code to deal with alignments + should go in a new function. */ +#if 0 + if (TARGET_BIGTABLE && min_addr - ((1 << CACHE_LOG) - 2) - addr >= -32768 +#else + if (TARGET_BIGTABLE && (min_addr >= addr || addr <= 32768) +#endif + && max_addr - addr <= 32767 + saving - prev_saving) + { + PUT_MODE (pat, HImode); + return saving; + } + PUT_MODE (pat, TARGET_BIGTABLE ? SImode : HImode); + return 0; +} + /* Exported to toplev.c. 
Do a final pass over the function, just before delayed branch @@ -2101,7 +2702,10 @@ void machine_dependent_reorg (first) rtx first; { - rtx insn; + rtx insn, mova; + int num_mova; + rtx r0_rtx = gen_rtx (REG, Pmode, 0); + rtx r0_inc_rtx = gen_rtx (POST_INC, Pmode, r0_rtx); /* If relaxing, generate pseudo-ops to associate function calls with the symbols they call. It does no harm to not generate these @@ -2109,6 +2713,7 @@ machine_dependent_reorg (first) linker to potentially relax the jsr to a bsr, and eliminate the register load and, possibly, the constant pool entry. */ + mdep_reorg_phase = SH_INSERT_USES_LABELS; if (TARGET_RELAX) { /* Remove all REG_LABEL notes. We want to use them for our own @@ -2330,21 +2935,178 @@ machine_dependent_reorg (first) } } + /* The following processing passes need length information. + addr_diff_vec_adjust needs to know if insn_addreses is valid. */ + insn_addresses = 0; + + /* If not optimizing for space, we want extra alignment for code after + a barrier, so that it starts on a word / cache line boundary. + We used to emit the alignment for the barrier itself and associate the + instruction length with the following instruction, but that had two + problems: + i) A code label that follows directly after a barrier gets too low an + address. When there is a forward branch to it, the incorrect distance + calculation can lead to out of range branches. That happened with + compile/920625-2 -O -fomit-frame-pointer in copyQueryResult. + ii) barriers before constant tables get the extra alignment too. + That is just a waste of space. + + So what we do now is to insert align_* instructions after the + barriers. By doing that before literal tables are generated, we + don't have to care about these. */ + /* We also want alignment in front of ADDR_DIFF_VECs; this is done already + by ASM_OUTPUT_CASE_LABEL, but when optimizing, we have to make it + explicit in the RTL in order to correctly shorten branches. */ + + if (optimize) + for (insn = first; insn; insn = NEXT_INSN (insn)) + { + rtx addr_diff_vec; + + if (GET_CODE (insn) == BARRIER + && (addr_diff_vec = next_real_insn (insn))) + if (GET_CODE (PATTERN (addr_diff_vec)) == ADDR_DIFF_VEC) + emit_insn_before (gen_align_4 (), + XEXP (XEXP (PATTERN (addr_diff_vec), 0), 0)); + else if (TARGET_SMALLCODE) + continue; + else if (TARGET_SH3) + { + /* We align for an entire cache line. If there is a immediately + preceding branch to the insn beyond the barrier, it does not + make sense to insert the align, because we are more likely + to discard useful information from the current cache line + when doing the align than to fetch unneeded insns when not. */ + rtx prev = prev_real_insn (prev_real_insn (insn)); + int slot, credit; + + for (slot = 2, credit = 1 << (CACHE_LOG - 2) + 2; + credit >= 0 && prev && GET_CODE (prev) == INSN; + prev = prev_real_insn (prev)) + { + if (GET_CODE (PATTERN (prev)) == USE + || GET_CODE (PATTERN (prev)) == CLOBBER) + continue; + if (slot && + get_attr_in_delay_slot (prev) == IN_DELAY_SLOT_YES) + slot = 0; + credit -= get_attr_length (prev); + } + if (! prev || GET_CODE (prev) != JUMP_INSN + || (next_real_insn (JUMP_LABEL (prev)) + != next_real_insn (insn)) + || (credit - slot + < (GET_CODE (SET_SRC (PATTERN (prev))) == PC ? 
2 : 0))) + { + insn = emit_insn_after (gen_align_log (GEN_INT (CACHE_LOG)), + insn); + insn = emit_barrier_after (insn); + } + } + else + { + insn = emit_insn_after (gen_align_4 (), insn); + insn = emit_barrier_after (insn); + } + else if (TARGET_SMALLCODE) + continue; + else if (GET_CODE (insn) == NOTE + && NOTE_LINE_NUMBER (insn) == NOTE_INSN_LOOP_BEG) + { + rtx next = next_nonnote_insn (insn); + if (next && GET_CODE (next) == CODE_LABEL) + emit_insn_after (gen_align_4 (), insn); + } + } + + /* If TARGET_IEEE, we might have to split some branches before fixup_align. + If optimizing, the double call to shorten_branches will split insns twice, + unless we split now all that is to split and delete the original insn. */ + if (TARGET_IEEE || optimize) + for (insn = NEXT_INSN (first); insn; insn = NEXT_INSN (insn)) + if (GET_RTX_CLASS (GET_CODE (insn)) == 'i' && ! INSN_DELETED_P (insn)) + { + rtx old = insn; + insn = try_split (PATTERN (insn), insn, 1); + if (INSN_DELETED_P (old)) + { + PUT_CODE (old, NOTE); + NOTE_LINE_NUMBER (old) = NOTE_INSN_DELETED; + NOTE_SOURCE_FILE (old) = 0; + } + } + + max_uid_before_fixup_addr_diff_vecs = get_max_uid (); + + if (optimize) + { + uid_align_max = get_max_uid (); + uid_align = (rtx *) alloca (uid_align_max * sizeof *uid_align); + fixup_aligns (); + mdep_reorg_phase = SH_SHORTEN_BRANCHES0; + shorten_branches (first); + } + fixup_addr_diff_vecs (first); /* Scan the function looking for move instructions which have to be changed to pc-relative loads and insert the literal tables. */ - for (insn = first; insn; insn = NEXT_INSN (insn)) + mdep_reorg_phase = SH_FIXUP_PCLOAD; + for (insn = first, num_mova = 0; insn; insn = NEXT_INSN (insn)) { + if (mova_p (insn)) + { + if (! num_mova++) + mova = insn; + } + else if (GET_CODE (insn) == JUMP_INSN + && GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC + && num_mova) + { + rtx scan; + int total; + + num_mova--; + + /* Some code might have been inserted between the mova and + its ADDR_DIFF_VEC. Check if the mova is still in range. */ + for (scan = mova, total = 0; scan != insn; scan = NEXT_INSN (scan)) + if (INSN_UID (scan) < max_uid_before_fixup_addr_diff_vecs) + total += get_attr_length (scan); + + /* range of mova is 1020, add 4 because pc counts from address of + second instruction after this one, subtract 2 in case pc is 2 + byte aligned. Possible alignment needed for the ADDR_DIFF_VEC + chancles out with alignment effects of the mova itself. */ + if (total > 1022) + { + /* Change the mova into a load, and restart scanning + there. broken_move will then return true for mova. */ + SET_SRC (PATTERN (mova)) + = XVECEXP (SET_SRC (PATTERN (mova)), 0, 0); + INSN_CODE (mova) = -1; + insn = mova; + } + } if (broken_move (insn)) { rtx scan; /* Scan ahead looking for a barrier to stick the constant table behind. */ - rtx barrier = find_barrier (insn); + rtx barrier = find_barrier (num_mova, mova, insn); + rtx last_float_move, last_float = 0, *last_float_addr; + if (num_mova && ! mova_p (mova)) + { + /* find_barrier had to change the first mova into a + pcload; thus, we have to start with this new pcload. */ + insn = mova; + num_mova = 0; + } /* Now find all the moves between the points and modify them. 
*/ for (scan = insn; scan != barrier; scan = NEXT_INSN (scan)) { + if (GET_CODE (scan) == CODE_LABEL) + last_float = 0; if (broken_move (scan)) { rtx *patp = &PATTERN (scan), pat = *patp; @@ -2373,17 +3135,306 @@ machine_dependent_reorg (first) dst = gen_rtx (REG, HImode, REGNO (dst) + offset); } - lab = add_constant (src, mode); - newsrc = gen_rtx (MEM, mode, - gen_rtx (LABEL_REF, VOIDmode, lab)); + if (GET_CODE (dst) == REG + && ((REGNO (dst) >= FIRST_FP_REG + && REGNO (dst) <= LAST_FP_REG) + || REGNO (dst) == FPUL_REG)) + { + if (last_float + && reg_set_between_p (r0_rtx, last_float_move, scan)) + last_float = 0; + lab = add_constant (src, mode, last_float); + if (lab) + emit_insn_before (gen_mova (lab), scan); + else + *last_float_addr = r0_inc_rtx; + last_float_move = scan; + last_float = src; + newsrc = gen_rtx (MEM, mode, + (REGNO (dst) == FPUL_REG + ? r0_inc_rtx + : r0_rtx)); + last_float_addr = &XEXP (newsrc, 0); + } + else + { + lab = add_constant (src, mode, 0); + newsrc = gen_rtx (MEM, mode, + gen_rtx (LABEL_REF, VOIDmode, lab)); + } RTX_UNCHANGING_P (newsrc) = 1; *patp = gen_rtx (SET, VOIDmode, dst, newsrc); INSN_CODE (scan) = -1; } } dump_table (barrier); + insn = barrier; } } + + mdep_reorg_phase = SH_SHORTEN_BRANCHES1; + insn_addresses = 0; + split_branches (first); + + /* The INSN_REFERENCES_ARE_DELAYED in sh.h is problematic because it + also has an effect on the register that holds the addres of the sfunc. + Insert an extra dummy insn in front of each sfunc that pretends to + use this register. */ + if (flag_delayed_branch) + { + for (insn = first; insn; insn = NEXT_INSN (insn)) + { + rtx reg = sfunc_uses_reg (insn); + + if (! reg) + continue; + emit_insn_before (gen_use_sfunc_addr (reg), insn); + } + } + mdep_reorg_phase = SH_AFTER_MDEP_REORG; +} + +int +get_dest_uid (label, max_uid) + rtx label; + int max_uid; +{ + rtx dest = next_real_insn (label); + int dest_uid; + if (! dest) + /* This can happen for an undefined label. */ + return 0; + dest_uid = INSN_UID (dest); + /* If this is a newly created branch redirection blocking instruction, + we cannot index the branch_uid or insn_addresses arrays with its + uid. But then, we won't need to, because the actual destination is + the following branch. */ + while (dest_uid >= max_uid) + { + dest = NEXT_INSN (dest); + dest_uid = INSN_UID (dest); + } + if (GET_CODE (dest) == JUMP_INSN && GET_CODE (PATTERN (dest)) == RETURN) + return 0; + return dest_uid; +} + +/* Split condbranches that are out of range. Also add clobbers for + scratch registers that are needed in far jumps. + We do this before delay slot scheduling, so that it can take our + newly created instructions into account. It also allows us to + find branches with common targets more easily. */ + +static void +split_branches (first) + rtx first; +{ + rtx insn; + struct far_branch **uid_branch, *far_branch_list = 0; + int max_uid = get_max_uid (); + + /* Find out which branches are out of range. */ + uid_align_max = get_max_uid (); + uid_align = (rtx *) alloca (uid_align_max * sizeof *uid_align); + fixup_aligns (); + shorten_branches (first); + + uid_branch = (struct far_branch **) alloca (max_uid * sizeof *uid_branch); + bzero ((char *) uid_branch, max_uid * sizeof *uid_branch); + + for (insn = first; insn; insn = NEXT_INSN (insn)) + if (GET_RTX_CLASS (GET_CODE (insn)) != 'i') + continue; + else if (INSN_DELETED_P (insn)) + { + /* Shorten_branches would split this instruction again, + so transform it into a note. 
*/ + PUT_CODE (insn, NOTE); + NOTE_LINE_NUMBER (insn) = NOTE_INSN_DELETED; + NOTE_SOURCE_FILE (insn) = 0; + } + else if (GET_CODE (insn) == JUMP_INSN + /* Don't mess with ADDR_DIFF_VEC */ + && (GET_CODE (PATTERN (insn)) == SET + || GET_CODE (PATTERN (insn)) == RETURN)) + { + enum attr_type type = get_attr_type (insn); + if (type == TYPE_CBRANCH) + { + rtx next, beyond; + + if (get_attr_length (insn) > 4) + { + rtx src = SET_SRC (PATTERN (insn)); + rtx cond = XEXP (src, 0); + rtx olabel = XEXP (XEXP (src, 1), 0); + rtx jump; + int addr = insn_addresses[INSN_UID (insn)]; + rtx label = 0; + int dest_uid = get_dest_uid (olabel, max_uid); + struct far_branch *bp = uid_branch[dest_uid]; + + /* redirect_jump needs a valid JUMP_LABEL, and it might delete + the label if th lABEL_BUSES count drops to zero. There is + always a jump_optimize pass that sets these values, but it + proceeds to delete unreferenced code, and then if not + optimizeing, to un-delete the deleted instructions, thus + leaving labels with too low uses counts. */ + if (! optimize) + { + JUMP_LABEL (insn) = olabel; + LABEL_NUSES (olabel)++; + } + if (! bp) + { + bp = (struct far_branch *) alloca (sizeof *bp); + uid_branch[dest_uid] = bp; + bp->prev = far_branch_list; + far_branch_list = bp; + bp->far_label + = XEXP (XEXP (SET_SRC (PATTERN (insn)), 1), 0); + LABEL_NUSES (bp->far_label)++; + } + else + { + label = bp->near_label; + if (! label && bp->address - addr >= CONDJUMP_MIN) + { + rtx block = bp->insert_place; + + if (GET_CODE (PATTERN (block)) == RETURN) + block = PREV_INSN (block); + else + block = gen_block_redirect (block, + bp->address, 2); + label = emit_label_after (gen_label_rtx (), + PREV_INSN (block)); + bp->near_label = label; + } + else if (label && ! NEXT_INSN (label)) + if (addr + 2 - bp->address <= CONDJUMP_MAX) + bp->insert_place = insn; + else + gen_far_branch (bp); + } + if (! label + || NEXT_INSN (label) && bp->address - addr < CONDJUMP_MIN) + { + bp->near_label = label = gen_label_rtx (); + bp->insert_place = insn; + bp->address = addr; + } + if (! redirect_jump (insn, label)) + abort (); + } + else + { + /* get_attr_length (insn) == 2 */ + /* Check if we have a pattern where reorg wants to redirect + the branch to a label from an unconditional branch that + is too far away. */ + /* We can't use JUMP_LABEL here because it might be undefined + when not optimizing. 
*/ + beyond + = next_active_insn (XEXP (XEXP (SET_SRC (PATTERN (insn)), 1), + 0)); + + if ((GET_CODE (beyond) == JUMP_INSN + || (GET_CODE (beyond = next_active_insn (beyond)) + == JUMP_INSN)) + && GET_CODE (PATTERN (beyond)) == SET + && recog_memoized (beyond) == CODE_FOR_jump + && ((insn_addresses[INSN_UID (XEXP (SET_SRC (PATTERN (beyond)), 0))] + - insn_addresses[INSN_UID (insn)] + 252U) + > 252 + 258 + 2)) + gen_block_redirect (beyond, + insn_addresses[INSN_UID (beyond)], 1); + } + + next = next_active_insn (insn); + + if ((GET_CODE (next) == JUMP_INSN + || GET_CODE (next = next_active_insn (next)) == JUMP_INSN) + && GET_CODE (PATTERN (next)) == SET + && recog_memoized (next) == CODE_FOR_jump + && ((insn_addresses[INSN_UID (XEXP (SET_SRC (PATTERN (next)), 0))] + - insn_addresses[INSN_UID (insn)] + 252U) + > 252 + 258 + 2)) + gen_block_redirect (next, insn_addresses[INSN_UID (next)], 1); + } + else if (type == TYPE_JUMP || type == TYPE_RETURN) + { + int addr = insn_addresses[INSN_UID (insn)]; + rtx far_label = 0; + int dest_uid = 0; + struct far_branch *bp; + + if (type == TYPE_JUMP) + { + far_label = XEXP (SET_SRC (PATTERN (insn)), 0); + dest_uid = get_dest_uid (far_label, max_uid); + if (! dest_uid) + { + /* Parse errors can lead to labels outside + the insn stream. */ + if (! NEXT_INSN (far_label)) + continue; + + if (! optimize) + { + JUMP_LABEL (insn) = far_label; + LABEL_NUSES (far_label)++; + } + redirect_jump (insn, NULL_RTX); + far_label = 0; + } + } + bp = uid_branch[dest_uid]; + if (! bp) + { + bp = (struct far_branch *) alloca (sizeof *bp); + uid_branch[dest_uid] = bp; + bp->prev = far_branch_list; + far_branch_list = bp; + bp->near_label = 0; + bp->far_label = far_label; + if (far_label) + LABEL_NUSES (far_label)++; + } + else if (bp->near_label && ! NEXT_INSN (bp->near_label)) + if (addr - bp->address <= CONDJUMP_MAX) + emit_label_after (bp->near_label, PREV_INSN (insn)); + else + { + gen_far_branch (bp); + bp->near_label = 0; + } + else + bp->near_label = 0; + bp->address = addr; + bp->insert_place = insn; + if (! far_label) + emit_insn_before (gen_block_branch_redirect (const0_rtx), insn); + else + gen_block_redirect (insn, addr, bp->near_label ? 2 : 0); + } + } + /* Generate all pending far branches, + and free our references to the far labels. */ + while (far_branch_list) + { + if (far_branch_list->near_label + && ! NEXT_INSN (far_branch_list->near_label)) + gen_far_branch (far_branch_list); + if (optimize + && far_branch_list->far_label + && ! 
--LABEL_NUSES (far_branch_list->far_label)) + delete_insn (far_branch_list->far_label); + far_branch_list = far_branch_list->prev; + } + uid_align_max = get_max_uid (); + uid_align = (rtx *) oballoc (uid_align_max * sizeof *uid_align); + fixup_aligns (); } /* Dump out instruction addresses, which is useful for debugging the @@ -2545,10 +3596,11 @@ push (rn) rtx x; if ((rn >= FIRST_FP_REG && rn <= LAST_FP_REG) || rn == FPUL_REG) - x = emit_insn (gen_push_e (gen_rtx (REG, SFmode, rn))); + x = gen_push_e (gen_rtx (REG, SFmode, rn)); else - x = emit_insn (gen_push (gen_rtx (REG, SImode, rn))); + x = gen_push (gen_rtx (REG, SImode, rn)); + x = emit_insn (x); REG_NOTES (x) = gen_rtx (EXPR_LIST, REG_INC, gen_rtx(REG, SImode, STACK_POINTER_REGNUM), 0); } @@ -2562,16 +3614,16 @@ pop (rn) rtx x; if ((rn >= FIRST_FP_REG && rn <= LAST_FP_REG) || rn == FPUL_REG) - x = emit_insn (gen_pop_e (gen_rtx (REG, SFmode, rn))); + x = gen_pop_e (gen_rtx (REG, SFmode, rn)); else - x = emit_insn (gen_pop (gen_rtx (REG, SImode, rn))); + x = gen_pop (gen_rtx (REG, SImode, rn)); + x = emit_insn (x); REG_NOTES (x) = gen_rtx (EXPR_LIST, REG_INC, gen_rtx(REG, SImode, STACK_POINTER_REGNUM), 0); } -/* Generate code to push the regs specified in the mask, and return - the number of bytes the insns take. */ +/* Generate code to push the regs specified in the mask. */ static void push_regs (mask, mask2) @@ -2579,16 +3631,21 @@ push_regs (mask, mask2) { int i; + /* Push PR last; this gives better latencies after the prologue, and + candidates for the return delay slot when there are no general + registers pushed. */ for (i = 0; i < 32; i++) - if (mask & (1 << i)) + if (mask & (1 << i) && i != PR_REG) push (i); for (i = 32; i < FIRST_PSEUDO_REGISTER; i++) if (mask2 & (1 << (i - 32))) push (i); + if (mask & (1 << PR_REG)) + push (PR_REG); } /* Work out the registers which need to be saved, both as a mask and a - count. + count of saved words. If doing a pragma interrupt function, then push all regs used by the function, and if we call another function (we can tell by looking at PR), @@ -2601,40 +3658,28 @@ calc_live_regs (count_ptr, live_regs_mask2) { int reg; int live_regs_mask = 0; - int count = 0; + int count; *live_regs_mask2 = 0; - for (reg = 0; reg < FIRST_PSEUDO_REGISTER; reg++) + for (count = 0, reg = FIRST_PSEUDO_REGISTER - 1; reg >= 0; reg--) { - if (pragma_interrupt && ! pragma_trapa) + if ((pragma_interrupt && ! pragma_trapa) + ? (/* Need to save all the regs ever live. */ + (regs_ever_live[reg] + || (call_used_regs[reg] + && (! fixed_regs[reg] || reg == MACH_REG || reg == MACL_REG) + && regs_ever_live[PR_REG])) + && reg != STACK_POINTER_REGNUM && reg != ARG_POINTER_REGNUM + && reg != RETURN_ADDRESS_POINTER_REGNUM + && reg != T_REG && reg != GBR_REG) + : (/* Only push those regs which are used and need to be saved. */ + regs_ever_live[reg] && ! call_used_regs[reg])) { - /* Need to save all the regs ever live. */ - if ((regs_ever_live[reg] - || (call_used_regs[reg] - && (! fixed_regs[reg] || reg == MACH_REG || reg == MACL_REG) - && regs_ever_live[PR_REG])) - && reg != STACK_POINTER_REGNUM && reg != ARG_POINTER_REGNUM - && reg != RETURN_ADDRESS_POINTER_REGNUM - && reg != T_REG && reg != GBR_REG) - { - if (reg >= 32) - *live_regs_mask2 |= 1 << (reg - 32); - else - live_regs_mask |= 1 << reg; - count++; - } - } - else - { - /* Only push those regs which are used and need to be saved. */ - if (regs_ever_live[reg] && ! 
call_used_regs[reg]) - { - if (reg >= 32) - *live_regs_mask2 |= 1 << (reg - 32); - else - live_regs_mask |= (1 << reg); - count++; - } + if (reg >= 32) + *live_regs_mask2 |= 1 << (reg - 32); + else + live_regs_mask |= 1 << reg; + count++; } } @@ -2650,7 +3695,6 @@ sh_expand_prologue () int live_regs_mask; int d, i; int live_regs_mask2; - live_regs_mask = calc_live_regs (&d, &live_regs_mask2); /* We have pretend args if we had an object sent partially in registers and partially on the stack, e.g. a large structure. */ @@ -2667,7 +3711,7 @@ sh_expand_prologue () /* This is not used by the SH3E calling convention */ if (!TARGET_SH3E) - { + { /* Push arg regs as if they'd been provided by caller in stack. */ for (i = 0; i < NPARM_REGS(SImode); i++) { @@ -2679,13 +3723,14 @@ sh_expand_prologue () push (rn); extra_push += 4; } - } + } } /* If we're supposed to switch stacks at function entry, do so now. */ if (sp_switch) emit_insn (gen_sp_switch_1 ()); + live_regs_mask = calc_live_regs (&d, &live_regs_mask2); push_regs (live_regs_mask, live_regs_mask2); output_stack_adjust (-get_frame_size (), stack_pointer_rtx, 3); @@ -2701,7 +3746,6 @@ sh_expand_epilogue () int d, i; int live_regs_mask2; - live_regs_mask = calc_live_regs (&d, &live_regs_mask2); if (frame_pointer_needed) { @@ -2726,10 +3770,13 @@ sh_expand_epilogue () /* Pop all the registers. */ + live_regs_mask = calc_live_regs (&d, &live_regs_mask2); + if (live_regs_mask & (1 << PR_REG)) + pop (PR_REG); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) { int j = (FIRST_PSEUDO_REGISTER - 1) - i; - if (j < 32 && (live_regs_mask & (1 << j))) + if (j < 32 && (live_regs_mask & (1 << j)) && j != PR_REG) pop (j); else if (j >= 32 && (live_regs_mask2 & (1 << (j - 32)))) pop (j); @@ -2794,17 +3841,19 @@ sh_builtin_saveregs (arglist) named args need not be saved. We explicitly build a pointer to the buffer because it halves the insn count when not optimizing (otherwise the pointer is built for each reg - saved). */ + saved). + We emit the moves in reverse order so that we can use predecrement. */ fpregs = gen_reg_rtx (Pmode); emit_move_insn (fpregs, XEXP (regbuf, 0)); - for (regno = first_floatreg; regno < NPARM_REGS (SFmode); regno ++) - emit_move_insn (gen_rtx (MEM, SFmode, - plus_constant (fpregs, - GET_MODE_SIZE (SFmode) - * (regno - first_floatreg))), - gen_rtx (REG, SFmode, - BASE_ARG_REG (SFmode) + regno)); + emit_insn (gen_addsi3 (fpregs, fpregs, + GEN_INT (n_floatregs * UNITS_PER_WORD))); + for (regno = NPARM_REGS (SFmode) - 1; regno >= first_floatreg; regno--) + { + emit_insn (gen_addsi3 (fpregs, fpregs, GEN_INT (- UNITS_PER_WORD))); + emit_move_insn (gen_rtx (MEM, SFmode, fpregs), + gen_rtx (REG, SFmode, BASE_ARG_REG (SFmode) + regno)); + } /* Return the address of the regbuf. 
*/ return XEXP (regbuf, 0); @@ -2821,9 +3870,11 @@ initial_elimination_offset (from, to) int regs_saved; int total_saved_regs_space; int total_auto_space = get_frame_size (); + int save_flags = target_flags; int live_regs_mask, live_regs_mask2; live_regs_mask = calc_live_regs (®s_saved, &live_regs_mask2); + target_flags = save_flags; total_saved_regs_space = (regs_saved) * 4; @@ -2840,13 +3891,10 @@ initial_elimination_offset (from, to) if (from == RETURN_ADDRESS_POINTER_REGNUM && (to == FRAME_POINTER_REGNUM || to == STACK_POINTER_REGNUM)) { - int i, n = 0; - for (i = PR_REG+1; i < 32; i++) + int i, n = total_saved_regs_space; + for (i = PR_REG-1; i >= 0; i--) if (live_regs_mask & (1 << i)) - n += 4; - for (i = 32; i < FIRST_PSEUDO_REGISTER; i++) - if (live_regs_mask2 & (1 << (i - 32))) - n += 4; + n -= 4; return n + total_auto_space; } @@ -3121,6 +4169,205 @@ fp_one_operand (op) REAL_VALUE_FROM_CONST_DOUBLE (r, op); return REAL_VALUES_EQUAL (r, dconst1); } + +int +braf_label_ref_operand(op, mode) + rtx op; + enum machine_mode mode; +{ + rtx prev; + + if (GET_CODE (op) != LABEL_REF) + return 0; + prev = prev_real_insn (XEXP (op, 0)); + if (GET_CODE (prev) != JUMP_INSN) + return 0; + prev = PATTERN (prev); + if (GET_CODE (prev) != PARALLEL || XVECLEN (prev, 0) != 2) + return 0; + prev = XVECEXP (prev, 0, 0); + if (GET_CODE (prev) != SET) + return 0; + prev = SET_SRC (prev); + if (GET_CODE (prev) != PLUS || XEXP (prev, 1) != op) + return 0; +} + +/* Return the offset of a branch. Offsets for backward branches are + reported relative to the branch instruction, while offsets for forward + branches are reported relative to the following instruction. */ + +int +branch_offset (branch) + rtx branch; +{ + rtx dest = SET_SRC (PATTERN (branch)), dest_next; + int branch_uid = INSN_UID (branch); + int dest_uid, dest_addr; + rtx branch_align = uid_align[branch_uid]; + + if (GET_CODE (dest) == IF_THEN_ELSE) + dest = XEXP (dest, 1); + dest = XEXP (dest, 0); + dest_uid = INSN_UID (dest); + dest_addr = insn_addresses[dest_uid]; + if (branch_align) + { + /* Forward branch. */ + /* If branch is in a sequence, get the successor of the sequence. */ + rtx next = NEXT_INSN (NEXT_INSN (PREV_INSN (branch))); + int next_addr = insn_addresses[INSN_UID (next)]; + int diff; + + /* If NEXT has been hoisted in a sequence further on, it address has + been clobbered in the previous pass. However, if that is the case, + we know that it is exactly 2 bytes long (because it fits in a delay + slot), and that there is a following label (the destination of the + instruction that filled its delay slot with NEXT). The address of + this label is reliable. */ + if (NEXT_INSN (next)) + { + int next_next_addr = insn_addresses[INSN_UID (NEXT_INSN (next))]; + if (next_addr > next_next_addr) + next_addr = next_next_addr - 2; + } + diff = dest_addr - next_addr; + /* If BRANCH_ALIGN has been the last insn, it might be a barrier or + a note. 
*/ + if ((insn_addresses[INSN_UID (branch_align)] < dest_addr + || (insn_addresses[INSN_UID (branch_align)] == dest_addr + && next_real_insn (dest) != branch_align)) + && GET_CODE (branch_align) == INSN) + { + int align = 1 << INTVAL (XVECEXP (PATTERN (branch_align), 0, 0)); + int align_addr = insn_addresses[INSN_UID (branch_align)]; + diff += (align_addr - 1) & (align - 2); + branch_align = uid_align[INSN_UID (branch_align)]; + if (insn_addresses[INSN_UID (branch_align)] <= dest_addr + && GET_CODE (branch_align) == INSN) + { + int align2 = 1 << INTVAL (XVECEXP (PATTERN (branch_align), 0, 0)); + align_addr = insn_addresses[INSN_UID (branch_align)]; + diff += (align_addr - 1) & (align2 - align); + } + } + return diff; + } + else + { + /* Backward branch. */ + int branch_addr = insn_addresses[branch_uid]; + int diff = dest_addr - branch_addr; + int old_align = 2; + + while (dest_uid >= uid_align_max || ! uid_align[dest_uid]) + { + /* Label might be outside the insn stream, or even in a separate + insn stream, after a syntax errror. */ + if (! NEXT_INSN (dest)) + return 0; + dest = NEXT_INSN (dest), dest_uid = INSN_UID (dest); + } + + /* By searching for a known destination, we might already have + stumbled on the alignment instruction. */ + if (GET_CODE (dest) == INSN + && GET_CODE (PATTERN (dest)) == UNSPEC_VOLATILE + && XINT (PATTERN (dest), 1) == 1 + && INTVAL (XVECEXP (PATTERN (dest), 0, 0)) > 1) + branch_align = dest; + else + branch_align = uid_align[dest_uid]; + while (insn_addresses[INSN_UID (branch_align)] <= branch_addr + && GET_CODE (branch_align) == INSN) + { + int align = 1 << INTVAL (XVECEXP (PATTERN (branch_align), 0, 0)); + int align_addr = insn_addresses[INSN_UID (branch_align)]; + diff -= (align_addr - 1) & (align - old_align); + old_align = align; + branch_align = uid_align[INSN_UID (branch_align)]; + } + return diff; + } +} + +int +short_cbranch_p (branch) + rtx branch; +{ + int offset; + + if (! insn_addresses) + return 0; + if (mdep_reorg_phase <= SH_FIXUP_PCLOAD) + return 0; + offset = branch_offset (branch); + return (offset >= -252 + && offset <= (NEXT_INSN (PREV_INSN (branch)) == branch ? 256 : 254)); +} + +/* The maximum range used for SImode constant pool entrys is 1018. A final + instruction can add 8 bytes while only being 4 bytes in size, thus we + can have a total of 1022 bytes in the pool. Add 4 bytes for a branch + instruction around the pool table, 2 bytes of alignment before the table, + and 30 bytes of alignment after the table. That gives a maximum total + pool size of 1058 bytes. + Worst case code/pool content size ratio is 1:2 (using asms). + Thus, in the worst case, there is one instruction in front of a maximum + sized pool, and then there are 1052 bytes of pool for every 508 bytes of + code. For the last n bytes of code, there are 2n + 36 bytes of pool. + If we have a forward branch, the initial table will be put after the + unconditional branch. + + ??? We could do much better by keeping track of the actual pcloads within + the branch range and in the pcload range in front of the branch range. */ + +int +med_branch_p (branch, condlen) + rtx branch; + int condlen; +{ + int offset; + + if (! insn_addresses) + return 0; + offset = branch_offset (branch); + if (mdep_reorg_phase <= SH_FIXUP_PCLOAD) + return offset - condlen >= -990 && offset <= 998; + return offset - condlen >= -4092 && offset <= 4094; +} + +int +braf_branch_p (branch, condlen) + rtx branch; + int condlen; +{ + int offset; + + if (! insn_addresses) + return 0; + if (! 
TARGET_SH2) + return 0; + offset = branch_offset (branch); + if (mdep_reorg_phase <= SH_FIXUP_PCLOAD) + return offset - condlen >= -10330 && offset <= 10330; + return offset -condlen >= -32764 && offset <= 32766; +} + +int +align_length (insn) + rtx insn; +{ + int align = 1 << INTVAL (XVECEXP (PATTERN (insn), 0, 0)); + if (! insn_addresses) + if (optimize + && (mdep_reorg_phase == SH_SHORTEN_BRANCHES0 + || mdep_reorg_phase == SH_SHORTEN_BRANCHES1)) + return 0; + else + return align - 2; + return align - 2 - ((insn_addresses[INSN_UID (insn)] - 2) & (align - 2)); +} /* Return non-zero if REG is not used after INSN. We assume REG is a reload reg, and therefore does diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h index a08c125cfb7..ef6aef3786e 100644 --- a/gcc/config/sh/sh.h +++ b/gcc/config/sh/sh.h @@ -804,9 +804,16 @@ struct sh_args { This macro is only used in this file. */ #define PASS_IN_REG_P(CUM, MODE, TYPE) \ - (ROUND_REG ((CUM), (MODE)) < NPARM_REGS (MODE) \ - && ((TYPE) == 0 || ! TREE_ADDRESSABLE ((tree)(TYPE))) \ - && (! TARGET_SH3E || (ROUND_REG((CUM), (MODE)) + (GET_MODE_SIZE(MODE)/4) <= NPARM_REGS (MODE)))) + (((TYPE) == 0 || ! TREE_ADDRESSABLE ((tree)(TYPE))) \ + && (TARGET_SH3E \ + ? ((MODE) == BLKmode \ + ? (((CUM).arg_count[(int) SH_ARG_INT] * UNITS_PER_WORD \ + + int_size_in_bytes (TYPE)) \ + <= NPARM_REGS (SImode) * UNITS_PER_WORD) \ + : ((ROUND_REG((CUM), (MODE)) \ + + HARD_REGNO_NREGS (BASE_ARG_REG (MODE), (MODE))) \ + <= NPARM_REGS (MODE))) \ + : ROUND_REG ((CUM), (MODE)) < NPARM_REGS (MODE))) /* Define where to put the arguments to a function. Value is zero to push the argument on the stack, @@ -1755,9 +1762,14 @@ sh_valid_machine_decl_attribute (DECL, ATTRIBUTES, IDENTIFIER, ARGS) (LENGTH) = align_length (X); \ if (GET_CODE (X) == JUMP_INSN \ && GET_CODE (PATTERN (X)) == ADDR_DIFF_VEC) \ - /* The code before an ADDR_DIFF_VEC is even aligned, thus \ - any odd estimate is wrong. */ \ - (LENGTH) &= ~1; + { \ + /* The code before an ADDR_DIFF_VEC is even aligned, \ + thus any odd estimate is wrong. */ \ + (LENGTH) &= ~1; \ + /* If not optimizing, the alignment is implicit. */ \ + if (! optimize) \ + (LENGTH) += 2; \ + } /* Enable a bug fix for the shorten_branches pass. */ #define SHORTEN_WITH_ADJUST_INSN_LENGTH @@ -1859,4 +1871,7 @@ do { \ /* For the sake of libgcc2.c, indicate target supports atexit. */ #define HAVE_ATEXIT +/* Enable the register move pass to improve code. */ +#define ENABLE_REGMOVE_PASS + #define SH_DYNAMIC_SHIFT_COST (TARGET_SH3 ? (TARGET_SMALLCODE ? 1 : 2) : 20) diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index 1b0e35d6ff7..1faad932426 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -69,67 +69,86 @@ ;; Target CPU. -(define_attr "cpu" "sh0,sh1,sh2,sh3,sh3e" +(define_attr "cpu" + "sh1,sh2,sh3,sh3e" (const (symbol_ref "sh_cpu_attr"))) +(define_attr "endian" "big,little" + (const (if_then_else (symbol_ref "TARGET_LITTLE_ENDIAN") + (const_string "little") (const_string "big")))) + ;; cbranch conditional branch instructions ;; jump unconditional jumps ;; arith ordinary arithmetic +;; arith3 a compound insn that behaves similarily to a sequence of +;; three insns of type arith +;; arith3b like above, but might end with a redirected branch ;; load from memory +;; load_si Likewise, SImode variant for general register. 
;; store to memory ;; move register to register +;; fmove register to register, floating point ;; smpy word precision integer multiply ;; dmpy longword or doublelongword precision integer multiply ;; return rts ;; pload load of pr reg, which can't be put into delay slot of rts ;; pstore store of pr reg, which can't be put into delay slot of jsr ;; pcload pc relative load of constant value +;; pcload_si Likewise, SImode variant for general register. ;; rte return from exception ;; sfunc special function call with known used registers ;; call function call ;; fp floating point ;; fdiv floating point divide (or square root) +;; gp_fpul move between general purpose register and fpul (define_attr "type" - "cbranch,jump,arith,other,load,store,move,smpy,dmpy,return,pload,pstore,pcload,rte,sfunc,call,fp,fdiv" + "cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,other,load,load_si,store,move,fmove,smpy,dmpy,return,pload,pstore,pcload,pcload_si,rte,sfunc,call,fp,fdiv,gp_fpul" (const_string "other")) ; If a conditional branch destination is within -252..258 bytes away ; from the instruction it can be 2 bytes long. Something in the ; range -4090..4100 bytes can be 6 bytes long. All other conditional -; branches are 16 bytes long. +; branches are initially assumed to be 16 bytes long. +; In machine_dependent_reorg, we split all branches that are longer than +; 2 bytes. ; An unconditional jump in the range -4092..4098 can be 2 bytes long. -; Otherwise, it must be 14 bytes long. +; For wider ranges, we need a combination of a code and a data part. +; If we can get a scratch register for a long range jump, the code +; part can be 4 bytes long; otherwise, it must be 8 bytes long. +; If the jump is in the range -32764..32770, the data part can be 2 bytes +; long; otherwise, it must be 6 bytes long. ; All other instructions are two bytes long by default. -; All positive offsets have an adjustment added, which is the number of bytes -; difference between this instruction length and the next larger instruction -; length. This is because shorten_branches starts with the largest -; instruction size and then tries to reduce them. 
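As a rough illustration (a sketch by way of example, not part of the patch), the conditional-branch arm of the "length" attribute defined just below amounts to the following C, using the post-pcload-fixup bounds of short_cbranch_p, med_branch_p and braf_branch_p from sh.c; the exact limits also depend on whether the delay slot is filled and on the current reorg phase.

    /* Illustrative sketch only: byte length chosen for a conditional
       branch whose target is OFFSET bytes away.  */
    static int
    cbranch_length_estimate (int offset)
    {
      if (offset >= -252 && offset <= 254)
        return 2;                  /* plain bt/bf; see short_cbranch_p for
                                      the exact upper bound */
      if (offset - 2 >= -4092 && offset <= 4094)
        return 6;                  /* inverted branch around a longer jump */
      if (offset - 2 >= -32764 && offset <= 32766)
        return 10;                 /* braf-based form with a data word */
      return 16;                   /* worst case: constant load plus jmp */
    }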
- (define_attr "length" "" (cond [(eq_attr "type" "cbranch") - (if_then_else (and (ge (minus (match_dup 0) (pc)) - (const_int -252)) - (le (minus (match_dup 0) (pc)) - (const_int 262))) - (const_int 2) - (if_then_else (and (ge (minus (match_dup 0) (pc)) - (const_int -4090)) - (le (minus (match_dup 0) (pc)) - (const_int 4110))) - (const_int 6) - (const_int 16))) - + (cond [(ne (symbol_ref "short_cbranch_p (insn)") (const_int 0)) + (const_int 2) + (ne (symbol_ref "med_branch_p (insn, 2)") (const_int 0)) + (const_int 6) + (ne (symbol_ref "braf_branch_p (insn, 2)") (const_int 0)) + (const_int 10) + (ne (pc) (pc)) + (const_int 12) + ] (const_int 16)) (eq_attr "type" "jump") - (if_then_else (and (ge (minus (match_dup 0) (pc)) - (const_int -4092)) - (le (minus (match_dup 0) (pc)) - (const_int 4110))) - (const_int 2) - (const_int 14)) + (cond [(ne (symbol_ref "med_branch_p (insn, 0)") (const_int 0)) + (const_int 2) + (and (eq (symbol_ref "GET_CODE (PREV_INSN (insn))") + (symbol_ref "INSN")) + (eq (symbol_ref "INSN_CODE (PREV_INSN (insn))") + (symbol_ref "code_for_indirect_jump_scratch"))) + (if_then_else (ne (symbol_ref "braf_branch_p (insn, 0)") + (const_int 0)) + (const_int 6) + (const_int 10)) + (ne (symbol_ref "braf_branch_p (insn, 0)") (const_int 0)) + (const_int 10) + (ne (pc) (pc)) + (const_int 12) + ] (const_int 14)) ] (const_int 2))) ;; (define_function_unit {name} {num-units} {n-users} {test} @@ -140,16 +159,41 @@ ;; gcc to separate load and store instructions by one instruction, ;; which makes it more likely that the linker will be able to word ;; align them when relaxing. + +;; Loads have a latency of two. +;; However, call insn can have ;; a delay slot, so that we want one more +;; insn to be scheduled between the load of the function address and the call. +;; This is equivalent to a latency of three. +;; We cannot use a conflict list for this, because we need to distinguish +;; between the actual call address and the function arguments. +;; ADJUST_COST can only properly handle reductions of the cost, so we +;; use a latency of three here. +;; We only do this for SImode loads of general regsiters, to make the work +;; for ADJUST_COST easier. (define_function_unit "memory" 1 0 - (eq_attr "type" "load,pcload,pload,store,pstore") 2 2) + (eq_attr "type" "load_si,pcload_si") + 3 2) +(define_function_unit "memory" 1 0 + (eq_attr "type" "load,pcload,pload,store,pstore") + 2 2) + +(define_function_unit "int" 1 0 + (eq_attr "type" "arith3,arith3b") 3 3) + +(define_function_unit "int" 1 0 + (eq_attr "type" "dyn_shift") 2 2) + +(define_function_unit "int" 1 0 + (eq_attr "type" "arith,arith3b,dyn_shift") 2 2) ;; ??? These are approximations. (define_function_unit "mpy" 1 0 (eq_attr "type" "smpy") 2 2) (define_function_unit "mpy" 1 0 (eq_attr "type" "dmpy") 3 3) -(define_function_unit "fp" 1 0 (eq_attr "type" "fp") 2 1) +(define_function_unit "fp" 1 0 (eq_attr "type" "fp,fmove") 2 1) (define_function_unit "fp" 1 0 (eq_attr "type" "fdiv") 13 12) + ; Definitions for filling branch delay slots. 
(define_attr "needs_delay_slot" "yes,no" (const_string "no")) @@ -161,7 +205,7 @@ (define_attr "in_delay_slot" "yes,no" (cond [(eq_attr "type" "cbranch") (const_string "no") - (eq_attr "type" "pcload") (const_string "no") + (eq_attr "type" "pcload,pcload_si") (const_string "no") (eq_attr "needs_delay_slot" "yes") (const_string "no") (eq_attr "length" "2") (const_string "yes") ] (const_string "no"))) @@ -197,27 +241,15 @@ ;; Say that we have annulled true branches, since this gives smaller and ;; faster code when branches are predicted as not taken. -;; ??? Branches which are out-of-range actually have two delay slots, -;; the first is either always executed or else annulled false, and the -;; second is always annulled false. Handling these differently from -;; in range branches would give better code. - (define_delay (and (eq_attr "type" "cbranch") - (eq_attr "cpu" "sh2,sh3")) + (ne (symbol_ref "TARGET_SH2") (const_int 0))) [(eq_attr "in_delay_slot" "yes") (eq_attr "in_delay_slot" "yes") (nil)]) ;; ------------------------------------------------------------------------- ;; SImode signed integer comparisons ;; ------------------------------------------------------------------------- -(define_insn "" - [(set (match_operand:SI 0 "arith_reg_operand" "=r") - (eq:SI (reg:SI 18) - (const_int 1)))] - "" - "movt %0") - (define_insn "" [(set (reg:SI 18) (eq:SI (and:SI (match_operand:SI 0 "arith_reg_operand" "z,r") @@ -288,6 +320,93 @@ }") ;; ------------------------------------------------------------------------- +;; DImode signed integer comparisons +;; ------------------------------------------------------------------------- + +;; ??? Could get better scheduling by splitting the initial test from the +;; rest of the insn after reload. However, the gain would hardly justify +;; the sh.md size increase necessary to do that. + +(define_insn "" + [(set (reg:SI 18) + (eq:SI (and:DI (match_operand:DI 0 "arith_reg_operand" "r") + (match_operand:DI 1 "arith_operand" "r")) + (const_int 0)))] + "" + "* return output_branchy_insn (EQ, \"tst\\t%S1,%S0\;bf\\t%l9\;tst\\t%R1,%R0\", + insn, operands);" + [(set_attr "length" "6") + (set_attr "type" "arith3b")]) + +(define_insn "cmpeqdi_t" + [(set (reg:SI 18) (eq:SI (match_operand:DI 0 "arith_reg_operand" "r,r") + (match_operand:DI 1 "arith_reg_or_0_operand" "N,r")))] + "" + "* + return output_branchy_insn + (EQ, + (which_alternative + ? 
\"cmp/eq\\t%S1,%S0\;bf\\t%l9\;cmp/eq\\t%R1,%R0\" + : \"tst\\t%S0,%S0\;bf\\t%l9\;tst\\t%R0,%R0\"), + insn, operands);" + [(set_attr "length" "6") + (set_attr "type" "arith3b")]) + +(define_insn "cmpgtdi_t" + [(set (reg:SI 18) (gt:SI (match_operand:DI 0 "arith_reg_operand" "r,r") + (match_operand:DI 1 "arith_reg_or_0_operand" "r,N")))] + "TARGET_SH2" + "@ + cmp/eq\\t%S1,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/gt\\t%S1,%S0\;cmp/hi\\t%R1,%R0\\n%,Ldi%=: + tst\\t%S0,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/pl\\t%S0\;cmp/hi\\t%S0,%R0\\n%,Ldi%=:" + [(set_attr "length" "8") + (set_attr "type" "arith3")]) + +(define_insn "cmpgedi_t" + [(set (reg:SI 18) (ge:SI (match_operand:DI 0 "arith_reg_operand" "r,r") + (match_operand:DI 1 "arith_reg_or_0_operand" "r,N")))] + "TARGET_SH2" + "@ + cmp/eq\\t%S1,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/ge\\t%S1,%S0\;cmp/hs\\t%R1,%R0\\n%,Ldi%=: + cmp/pz\\t%S0" + [(set_attr "length" "8,2") + (set_attr "type" "arith3,arith")]) + +;; ------------------------------------------------------------------------- +;; DImode unsigned integer comparisons +;; ------------------------------------------------------------------------- + +(define_insn "cmpgeudi_t" + [(set (reg:SI 18) (geu:SI (match_operand:DI 0 "arith_reg_operand" "r") + (match_operand:DI 1 "arith_reg_operand" "r")))] + "TARGET_SH2" + "cmp/eq\\t%S1,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/hs\\t%S1,%S0\;cmp/hs\\t%R1,%R0\\n%,Ldi%=:" + [(set_attr "length" "8") + (set_attr "type" "arith3")]) + +(define_insn "cmpgtudi_t" + [(set (reg:SI 18) (gtu:SI (match_operand:DI 0 "arith_reg_operand" "r") + (match_operand:DI 1 "arith_reg_operand" "r")))] + "TARGET_SH2" + "cmp/eq\\t%S1,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/hi\\t%S1,%S0\;cmp/hi\\t%R1,%R0\\n%,Ldi%=:" + [(set_attr "length" "8") + (set_attr "type" "arith3")]) + +;; We save the compare operands in the cmpxx patterns and use them when +;; we generate the branch. + +(define_expand "cmpdi" + [(set (reg:SI 18) (compare (match_operand:DI 0 "arith_operand" "") + (match_operand:DI 1 "arith_operand" "")))] + "TARGET_SH2" + " +{ + sh_compare_op0 = operands[0]; + sh_compare_op1 = operands[1]; + DONE; +}") + +;; ------------------------------------------------------------------------- ;; Addition instructions ;; ------------------------------------------------------------------------- @@ -299,9 +418,50 @@ (match_operand:DI 2 "arith_reg_operand" "r"))) (clobber (reg:SI 18))] "" - "clrt\;addc %R2,%R0\;addc %S2,%S0" + "#" [(set_attr "length" "6")]) +(define_split + [(set (match_operand:DI 0 "arith_reg_operand" "=r") + (plus:DI (match_operand:DI 1 "arith_reg_operand" "%0") + (match_operand:DI 2 "arith_reg_operand" "r"))) + (clobber (reg:SI 18))] + "reload_completed" + [(const_int 0)] + " +{ + rtx high0, high2, low0 = gen_lowpart (SImode, operands[0]); + high0 = gen_rtx (REG, SImode, + true_regnum (operands[0]) + (TARGET_LITTLE_ENDIAN ? 1 : 0)); + high2 = gen_rtx (REG, SImode, + true_regnum (operands[2]) + (TARGET_LITTLE_ENDIAN ? 
1 : 0)); + emit_insn (gen_clrt ()); + emit_insn (gen_addc (low0, low0, gen_lowpart (SImode, operands[2]))); + emit_insn (gen_addc1 (high0, high0, high2)); + DONE; +}") + +(define_insn "addc" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (plus:SI (plus:SI (match_operand:SI 1 "arith_reg_operand" "0") + (match_operand:SI 2 "arith_reg_operand" "r")) + (reg:SI 18))) + (set (reg:SI 18) + (ltu:SI (plus:SI (match_dup 1) (match_dup 2)) (match_dup 1)))] + "" + "addc %2,%0" + [(set_attr "type" "arith")]) + +(define_insn "addc1" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (plus:SI (plus:SI (match_operand:SI 1 "arith_reg_operand" "0") + (match_operand:SI 2 "arith_reg_operand" "r")) + (reg:SI 18))) + (clobber (reg:SI 18))] + "" + "addc %2,%0" + [(set_attr "type" "arith")]) + (define_insn "addsi3" [(set (match_operand:SI 0 "arith_reg_operand" "=r") (plus:SI (match_operand:SI 1 "arith_operand" "%0") @@ -322,9 +482,50 @@ (match_operand:DI 2 "arith_reg_operand" "r"))) (clobber (reg:SI 18))] "" - "clrt\;subc %R2,%R0\;subc %S2,%S0" + "#" [(set_attr "length" "6")]) +(define_split + [(set (match_operand:DI 0 "arith_reg_operand" "=r") + (minus:DI (match_operand:DI 1 "arith_reg_operand" "0") + (match_operand:DI 2 "arith_reg_operand" "r"))) + (clobber (reg:SI 18))] + "reload_completed" + [(const_int 0)] + " +{ + rtx high0, high2, low0 = gen_lowpart (SImode, operands[0]); + high0 = gen_rtx (REG, SImode, + true_regnum (operands[0]) + (TARGET_LITTLE_ENDIAN ? 1 : 0)); + high2 = gen_rtx (REG, SImode, + true_regnum (operands[2]) + (TARGET_LITTLE_ENDIAN ? 1 : 0)); + emit_insn (gen_clrt ()); + emit_insn (gen_subc (low0, low0, gen_lowpart (SImode, operands[2]))); + emit_insn (gen_subc1 (high0, high0, high2)); + DONE; +}") + +(define_insn "subc" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (minus:SI (minus:SI (match_operand:SI 1 "arith_reg_operand" "0") + (match_operand:SI 2 "arith_reg_operand" "r")) + (reg:SI 18))) + (set (reg:SI 18) + (gtu:SI (minus:SI (match_dup 1) (match_dup 2)) (match_dup 1)))] + "" + "subc %2,%0" + [(set_attr "type" "arith")]) + +(define_insn "subc1" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (minus:SI (minus:SI (match_operand:SI 1 "arith_reg_operand" "0") + (match_operand:SI 2 "arith_reg_operand" "r")) + (reg:SI 18))) + (clobber (reg:SI 18))] + "" + "subc %2,%0" + [(set_attr "type" "arith")]) + (define_insn "*subsi3_internal" [(set (match_operand:SI 0 "arith_reg_operand" "=r") (minus:SI (match_operand:SI 1 "arith_reg_operand" "0") @@ -360,6 +561,17 @@ ;; We take advantage of the library routines which don't clobber as many ;; registers as a normal function call would. +;; The INSN_REFERENCES_ARE_DELAYED in sh.h is problematic because it +;; also has an effect on the register that holds the addres of the sfunc. +;; To make this work, we have an extra dummy insns that shows the use +;; of this register for reorg. + +(define_insn "use_sfunc_addr" + [(set (reg:SI 17) (unspec [(match_operand:SI 0 "register_operand" "r")] 5))] + "" + "" + [(set_attr "length" "0")]) + ;; We must use a pseudo-reg forced to reg 0 in the SET_DEST rather than ;; hard register 0. If we used hard register 0, then the next instruction ;; would be a move from hard register 0 to a pseudo-reg. If the pseudo-reg @@ -369,14 +581,14 @@ ;; If we let reload allocate r0, then this problem can never happen. 
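The zero-length use_sfunc_addr marker defined above only exists so that the register holding the sfunc address stays visibly live for reorg. As a hedged sketch (illustrative, not code from the patch), a pass running around reorg time could plant that marker in front of an sfunc call roughly as follows; INSN and ADDR_REG are hypothetical names, not taken from the patch.

    /* Illustrative fragment, assuming the usual GCC-internal context:
       emit the zero-length marker before the sfunc call INSN so that
       ADDR_REG remains live across reorg.  */
    static void
    mark_sfunc_address_use (rtx insn, rtx addr_reg)
    {
      emit_insn_before (gen_use_sfunc_addr (addr_reg), insn);
    }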
(define_insn "" - [(set (match_operand:SI 1 "register_operand" "=z") + [(set (match_operand:SI 0 "register_operand" "=z") (udiv:SI (reg:SI 4) (reg:SI 5))) (clobber (reg:SI 18)) (clobber (reg:SI 17)) (clobber (reg:SI 4)) - (use (match_operand:SI 0 "arith_reg_operand" "r"))] + (use (match_operand:SI 1 "arith_reg_operand" "r"))] "" - "jsr @%0%#" + "jsr @%1%#" [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) @@ -395,16 +607,16 @@ "operands[3] = gen_reg_rtx(SImode);") (define_insn "" - [(set (match_operand:SI 1 "register_operand" "=z") + [(set (match_operand:SI 0 "register_operand" "=z") (div:SI (reg:SI 4) (reg:SI 5))) (clobber (reg:SI 18)) (clobber (reg:SI 17)) (clobber (reg:SI 1)) (clobber (reg:SI 2)) (clobber (reg:SI 3)) - (use (match_operand:SI 0 "arith_reg_operand" "r"))] + (use (match_operand:SI 1 "arith_reg_operand" "r"))] "" - "jsr @%0%#" + "jsr @%1%#" [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) @@ -676,7 +888,8 @@ (ior:SI (match_operand:SI 1 "arith_reg_operand" "%0,0") (match_operand:SI 2 "logical_operand" "r,L")))] "" - "or %2,%0") + "or %2,%0" + [(set_attr "type" "arith")]) (define_insn "xorsi3" [(set (match_operand:SI 0 "arith_reg_operand" "=z,r") @@ -697,7 +910,8 @@ (set (reg:SI 18) (lshiftrt:SI (match_dup 1) (const_int 31)))] "" - "rotl %0") + "rotl %0" + [(set_attr "type" "arith")]) (define_insn "rotlsi3_31" [(set (match_operand:SI 0 "arith_reg_operand" "=r") @@ -705,14 +919,16 @@ (const_int 31))) (clobber (reg:SI 18))] "" - "rotr %0") + "rotr %0" + [(set_attr "type" "arith")]) -(define_insn "" +(define_insn "rotlsi3_16" [(set (match_operand:SI 0 "arith_reg_operand" "=r") (rotate:SI (match_operand:SI 1 "arith_reg_operand" "r") (const_int 16)))] "" - "swap.w %1,%0") + "swap.w %1,%0" + [(set_attr "type" "arith")]) (define_expand "rotlsi3" [(set (match_operand:SI 0 "arith_reg_operand" "") @@ -721,29 +937,62 @@ "" " { + static char rot_tab[] = { + 000, 000, 000, 000, 000, 000, 010, 001, + 001, 001, 011, 013, 003, 003, 003, 003, + 003, 003, 003, 003, 003, 013, 012, 002, + 002, 002, 010, 000, 000, 000, 000, 000, + }; + + int count, choice; + if (GET_CODE (operands[2]) != CONST_INT) FAIL; - - if (INTVAL (operands[2]) == 1) - { - emit_insn (gen_rotlsi3_1 (operands[0], operands[1])); - DONE; - } - else if (INTVAL (operands[2]) == 31) - { - emit_insn (gen_rotlsi3_31 (operands[0], operands[1])); - DONE; - } - else if (INTVAL (operands[2]) != 16) + count = INTVAL (operands[2]); + choice = rot_tab[count]; + if (choice & 010 && SH_DYNAMIC_SHIFT_COST <= 1) FAIL; + choice &= 7; + switch (choice) + { + case 0: + emit_move_insn (operands[0], operands[1]); + count -= (count & 16) * 2; + break; + case 3: + emit_insn (gen_rotlsi3_16 (operands[0], operands[1])); + count -= 16; + break; + case 1: + case 2: + { + rtx parts[2]; + parts[0] = gen_reg_rtx (SImode); + parts[1] = gen_reg_rtx (SImode); + emit_insn (gen_rotlsi3_16 (parts[2-choice], operands[1])); + parts[choice-1] = operands[1]; + emit_insn (gen_ashlsi3 (parts[0], parts[0], GEN_INT (8))); + emit_insn (gen_lshrsi3 (parts[1], parts[1], GEN_INT (8))); + emit_insn (gen_iorsi3 (operands[0], parts[0], parts[1])); + count = (count & ~16) - 8; + } + } + + for (; count > 0; count--) + emit_insn (gen_rotlsi3_1 (operands[0], operands[0])); + for (; count < 0; count++) + emit_insn (gen_rotlsi3_31 (operands[0], operands[0])); + + DONE; }") -(define_insn "" +(define_insn "*rotlhi3_8" [(set (match_operand:HI 0 "arith_reg_operand" "=r") (rotate:HI (match_operand:HI 1 "arith_reg_operand" "r") (const_int 8)))] "" - "swap.b 
%1,%0") + "swap.b %1,%0" + [(set_attr "type" "arith")]) (define_expand "rotlhi3" [(set (match_operand:HI 0 "arith_reg_operand" "") @@ -764,7 +1013,8 @@ (ashift:SI (match_operand:SI 1 "arith_reg_operand" "0") (match_operand:SI 2 "arith_reg_operand" "r")))] "TARGET_SH3" - "shld %2,%0") + "shld %2,%0" + [(set_attr "type" "dyn_shift")]) (define_insn "ashlsi3_k" [(set (match_operand:SI 0 "arith_reg_operand" "=r,r") @@ -773,7 +1023,8 @@ "CONST_OK_FOR_K (INTVAL (operands[2]))" "@ add %0,%0 - shll%O2 %0") + shll%O2 %0" + [(set_attr "type" "arith")]) (define_insn "ashlhi3_k" [(set (match_operand:HI 0 "arith_reg_operand" "=r,r") @@ -782,14 +1033,15 @@ "CONST_OK_FOR_K (INTVAL (operands[2]))" "@ add %0,%0 - shll%O2 %0") + shll%O2 %0" + [(set_attr "type" "arith")]) (define_insn "ashlsi3_n" [(set (match_operand:SI 0 "arith_reg_operand" "=r") (ashift:SI (match_operand:SI 1 "arith_reg_operand" "0") (match_operand:SI 2 "const_int_operand" "n"))) (clobber (reg:SI 18))] - "" + "! sh_dynamicalize_shift_p (operands[2])" "#" [(set (attr "length") (cond [(eq (symbol_ref "shift_insns_rtx (insn)") (const_int 1)) @@ -822,6 +1074,9 @@ "" " { + if (GET_CODE (operands[2]) == CONST_INT + && sh_dynamicalize_shift_p (operands[2])) + operands[2] = force_reg (SImode, operands[2]); if (TARGET_SH3 && arith_reg_operand (operands[2], GET_MODE (operands[2]))) { emit_insn (gen_ashlsi3_d (operands[0], operands[1], operands[2])); @@ -888,27 +1143,59 @@ (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "r") (const_int 16)))] "" - "swap.w %1,%0\;exts.w %0,%0" + "#" [(set_attr "length" "4")]) +(define_split + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "r") + (const_int 16)))] + "" + [(set (match_dup 0) (rotate:SI (match_dup 1) (const_int 16))) + (set (match_dup 0) (sign_extend:SI (match_dup 2)))] + "operands[2] = gen_lowpart (HImode, operands[0]);") + ;; ??? This should be a define expand. (define_insn "ashrsi2_31" [(set (match_operand:SI 0 "arith_reg_operand" "=r") - (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0") - (const_int 31))) + (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0") + (const_int 31))) (clobber (reg:SI 18))] "" - "@ - shll %0\;subc %0,%0" + "#" [(set_attr "length" "4")]) +(define_split + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0") + (const_int 31))) + (clobber (reg:SI 18))] + "" + [(const_int 0)] + " +{ + emit_insn (gen_ashlsi_c (operands[0], operands[1], operands[1])); + emit_insn (gen_subc1 (operands[0], operands[0], operands[0])); + DONE; +}") + +(define_insn "ashlsi_c" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (ashift:SI (match_operand:SI 1 "arith_reg_operand" "0") (const_int 1))) + (set (reg:SI 18) (lt:SI (match_operand:SI 2 "arith_reg_operand" "0") + (const_int 0)))] + "" + "shll %0" + [(set_attr "type" "arith")]) + (define_insn "ashrsi3_d" [(set (match_operand:SI 0 "arith_reg_operand" "=r") (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0") (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))] "TARGET_SH3" - "shad %2,%0") + "shad %2,%0" + [(set_attr "type" "dyn_shift")]) (define_insn "ashrsi3_n" [(set (reg:SI 4) @@ -937,7 +1224,8 @@ (lshiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0") (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))] "TARGET_SH3" - "shld %2,%0") + "shld %2,%0" + [(set_attr "type" "dyn_shift")]) ;; Only the single bit shift clobbers the T bit. 
@@ -947,7 +1235,8 @@ (match_operand:SI 2 "const_int_operand" "M"))) (clobber (reg:SI 18))] "CONST_OK_FOR_M (INTVAL (operands[2]))" - "shlr %0") + "shlr %0" + [(set_attr "type" "arith")]) (define_insn "lshrsi3_k" [(set (match_operand:SI 0 "arith_reg_operand" "=r") @@ -955,7 +1244,8 @@ (match_operand:SI 2 "const_int_operand" "K")))] "CONST_OK_FOR_K (INTVAL (operands[2])) && ! CONST_OK_FOR_M (INTVAL (operands[2]))" - "shlr%O2 %0") + "shlr%O2 %0" + [(set_attr "type" "arith")]) (define_insn "lshrhi3_m" [(set (match_operand:HI 0 "arith_reg_operand" "=r") @@ -963,7 +1253,8 @@ (match_operand:HI 2 "const_int_operand" "M"))) (clobber (reg:SI 18))] "CONST_OK_FOR_M (INTVAL (operands[2]))" - "shlr %0") + "shlr %0" + [(set_attr "type" "arith")]) (define_insn "lshrhi3_k" [(set (match_operand:HI 0 "arith_reg_operand" "=r") @@ -971,14 +1262,15 @@ (match_operand:HI 2 "const_int_operand" "K")))] "CONST_OK_FOR_K (INTVAL (operands[2])) && ! CONST_OK_FOR_M (INTVAL (operands[2]))" - "shlr%O2 %0") + "shlr%O2 %0" + [(set_attr "type" "arith")]) (define_insn "lshrsi3_n" [(set (match_operand:SI 0 "arith_reg_operand" "=r") (lshiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0") (match_operand:SI 2 "const_int_operand" "n"))) (clobber (reg:SI 18))] - "" + "! sh_dynamicalize_shift_p (operands[2])" "#" [(set (attr "length") (cond [(eq (symbol_ref "shift_insns_rtx (insn)") (const_int 1)) @@ -1011,6 +1303,9 @@ "" " { + if (GET_CODE (operands[2]) == CONST_INT + && sh_dynamicalize_shift_p (operands[2])) + operands[2] = force_reg (SImode, operands[2]); if (TARGET_SH3 && arith_reg_operand (operands[2], GET_MODE (operands[2]))) { rtx count = copy_to_mode_reg (SImode, operands[2]); @@ -1060,7 +1355,8 @@ (clobber (reg:SI 18))] "" "shll %R0\;rotcl %S0" - [(set_attr "length" "4")]) + [(set_attr "length" "4") + (set_attr "type" "arith")]) (define_expand "ashldi3" [(parallel [(set (match_operand:DI 0 "arith_reg_operand" "") @@ -1080,7 +1376,8 @@ (clobber (reg:SI 18))] "" "shlr %S0\;rotcr %R0" - [(set_attr "length" "4")]) + [(set_attr "length" "4") + (set_attr "type" "arith")]) (define_expand "lshrdi3" [(parallel [(set (match_operand:DI 0 "arith_reg_operand" "") @@ -1100,7 +1397,8 @@ (clobber (reg:SI 18))] "" "shar %S0\;rotcr %R0" - [(set_attr "length" "4")]) + [(set_attr "length" "4") + (set_attr "type" "arith")]) (define_expand "ashrdi3" [(parallel [(set (match_operand:DI 0 "arith_reg_operand" "") @@ -1306,7 +1604,8 @@ (lshiftrt:SI (match_operand:SI 2 "arith_reg_operand" "0") (const_int 16))))] "" - "xtrct %1,%0") + "xtrct %1,%0" + [(set_attr "type" "arith")]) (define_insn "xtrct_right" [(set (match_operand:SI 0 "arith_reg_operand" "=r") @@ -1315,7 +1614,8 @@ (ashift:SI (match_operand:SI 2 "arith_reg_operand" "r") (const_int 16))))] "" - "xtrct %2,%0") + "xtrct %2,%0" + [(set_attr "type" "arith")]) ;; ------------------------------------------------------------------------- ;; Unary arithmetic @@ -1400,13 +1700,8 @@ ;; ??? This should be a define expand. ;; ??? Or perhaps it should be dropped? -(define_insn "extendsidi2" - [(set (match_operand:DI 0 "arith_reg_operand" "=r") - (sign_extend:DI (match_operand:SI 1 "arith_reg_operand" "r"))) - (clobber (reg:SI 18))] - "" - "mov %1,%S0\;mov %1,%R0\;shll %S0\;subc %S0,%S0" - [(set_attr "length" "8")]) +/* There is no point in defining extendsidi2; convert_move generates good + code for that. 
*/ (define_insn "extendhisi2" [(set (match_operand:SI 0 "arith_reg_operand" "=r,r") @@ -1499,17 +1794,17 @@ "" "sett") -;; t/z is first, so that it will be preferred over r/r when reloading a move +;; t/r is first, so that it will be preferred over r/r when reloading a move ;; of a pseudo-reg into the T reg (define_insn "movsi_i" - [(set (match_operand:SI 0 "general_movdst_operand" "=t,r,r,r,r,r,m,<,xl,xl,r") - (match_operand:SI 1 "general_movsrc_operand" "z,Q,rI,m,xl,t,r,xl,r,>,i"))] + [(set (match_operand:SI 0 "general_movdst_operand" "=t,r,r,r,r,r,m,<,<,xl,x,l,r") + (match_operand:SI 1 "general_movsrc_operand" "r,Q,rI,m,xl,t,r,x,l,r,>,>,i"))] " - ! TARGET_SH3E && - (register_operand (operands[0], SImode) - || register_operand (operands[1], SImode))" + ! TARGET_SH3E + && (register_operand (operands[0], SImode) + || register_operand (operands[1], SImode))" "@ - tst %1,%1\;rotcl %1\;xor #1,%1\;rotcr %1 + cmp/pl %1 mov.l %1,%0 mov %1,%0 mov.l %1,%0 @@ -1517,39 +1812,58 @@ movt %0 mov.l %1,%0 sts.l %1,%0 + sts.l %1,%0 lds %1,%0 lds.l %1,%0 + lds.l %1,%0 fake %1,%0" - [(set_attr "type" "move,pcload,move,load,move,store,store,move,load,move,pcload") - (set_attr "length" "8,*,*,*,*,*,*,*,*,*,*")]) + [(set_attr "type" "*,pcload_si,move,load_si,move,move,store,store,pstore,move,load,pload,pcload_si") + (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*")]) -;; t/z is first, so that it will be preferred over r/r when reloading a move -;; of a pseudo-reg into the T reg +;; t/r must come after r/r, lest reload will try to reload stuff like +;; (subreg:SI (reg:SF 38 fr14) 0) into T (compiling stdlib/strtod.c -m3e -O2) ;; ??? This allows moves from macl to fpul to be recognized, but these moves ;; will require a reload. (define_insn "movsi_ie" - [(set (match_operand:SI 0 "general_movdst_operand" "=t,r,r,r,r,r,m,<,xl,xl,r,y,r") - (match_operand:SI 1 "general_movsrc_operand" "z,Q,rI,m,xl,t,r,xl,r,>,i,r,y"))] + [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,t,r,r,r,m,<,<,xl,x,l,r,y,r,y") + (match_operand:SI 1 "general_movsrc_operand" "Q,rI,r,m,xl,t,r,x,l,r,>,>,i,r,y,y"))] "TARGET_SH3E && (register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" "@ - tst %1,%1\;rotcl %1\;xor #1,%1\;rotcr %1 + mov.l %1,%0 + mov %1,%0 + cmp/pl %1 + mov.l %1,%0 + sts %1,%0 + movt %0 + mov.l %1,%0 + sts.l %1,%0 + sts.l %1,%0 + lds %1,%0 + lds.l %1,%0 + lds.l %1,%0 + fake %1,%0 + lds %1,%0 + sts %1,%0 + ! move optimized away" + [(set_attr "type" "pcload_si,move,*,load_si,move,move,store,store,pstore,move,load,pload,pcload_si,gp_fpul,gp_fpul,other") + (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")]) + +(define_insn "movsi_i_lowpart" + [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "=r,r,r,r,r,m,r")) + (match_operand:SI 1 "general_movsrc_operand" "Q,rI,m,xl,t,r,i"))] + "register_operand (operands[0], SImode) + || register_operand (operands[1], SImode)" + "@ mov.l %1,%0 mov %1,%0 mov.l %1,%0 sts %1,%0 movt %0 mov.l %1,%0 - sts.l %1,%0 - lds %1,%0 - lds.l %1,%0 - fake %1,%0 - lds %1,%0 - sts %1,%0" - [(set_attr "type" "move,pcload,move,load,move,store,store,move,load,move,pcload,move,move") - (set_attr "length" "8,*,*,*,*,*,*,*,*,*,*,*,*")]) - + fake %1,%0" + [(set_attr "type" "pcload,move,load,move,move,store,pcload")]) (define_expand "movsi" [(set (match_operand:SI 0 "general_movdst_operand" "") (match_operand:SI 1 "general_movsrc_operand" ""))] @@ -1600,14 +1914,16 @@ ;; ??? This should be a define expand. +;; x/r can be created by inlining/cse, e.g. 
for execute/961213-1.c +;; compiled with -m2 -ml -O3 -funroll-loops (define_insn "" - [(set (match_operand:DI 0 "general_movdst_operand" "=r,r,r,m,r,r,r") - (match_operand:DI 1 "general_movsrc_operand" "Q,r,m,r,I,i,x"))] + [(set (match_operand:DI 0 "general_movdst_operand" "=r,r,r,m,r,r,r,*!x") + (match_operand:DI 1 "general_movsrc_operand" "Q,r,m,r,I,i,x,r"))] "arith_reg_operand (operands[0], DImode) || arith_reg_operand (operands[1], DImode)" "* return output_movedouble (insn, operands, DImode);" [(set_attr "length" "4") - (set_attr "type" "pcload,move,load,store,move,pcload,move")]) + (set_attr "type" "pcload,move,load,store,move,pcload,move,move")]) ;; If the output is a register and the input is memory or a register, we have ;; to be careful and see which word needs to be loaded first. @@ -1779,15 +2095,19 @@ [(set (match_operand:DF 0 "general_movdst_operand" "") (match_operand:DF 1 "general_movsrc_operand" ""))] "" - "{ if (prepare_move_operands (operands, DFmode)) DONE; }") + " +{ + if (prepare_move_operands (operands, DFmode)) DONE; +}") + (define_insn "movsf_i" [(set (match_operand:SF 0 "general_movdst_operand" "=r,r,r,r,m,l,r") (match_operand:SF 1 "general_movsrc_operand" "r,I,FQ,m,r,r,l"))] " - ! TARGET_SH3E && - (arith_reg_operand (operands[0], SFmode) - || arith_reg_operand (operands[1], SFmode))" + ! TARGET_SH3E + && (arith_reg_operand (operands[0], SFmode) + || arith_reg_operand (operands[1], SFmode))" "@ mov %1,%0 mov %1,%0 @@ -1800,10 +2120,10 @@ (define_insn "movsf_ie" [(set (match_operand:SF 0 "general_movdst_operand" - "=f,r,f,f,?f,f,m,r,r,m,!??r,!??f") + "=f,r,f,f,fy,f,m,r,r,m,f,y,y,rf,ry") (match_operand:SF 1 "general_movsrc_operand" - "f,r,G,H,FQ,m,f,FQ,m,r,f,r")) - (clobber (match_scratch:SI 2 "=X,X,X,X,&z,X,X,X,X,X,X,X"))] + "f,r,G,H,FQ,m,f,FQ,m,r,y,f,>,fr,yr")) + (clobber (match_scratch:SI 2 "=X,X,X,X,&z,X,X,X,X,X,X,X,X,y,X"))] "TARGET_SH3E && (arith_reg_operand (operands[0], SFmode) @@ -1819,29 +2139,39 @@ mov.l %1,%0 mov.l %1,%0 mov.l %1,%0 - flds %1,fpul\;sts fpul,%0 - lds %1,fpul\;fsts fpul,%0" - [(set_attr "type" "move,move,fp,fp,pcload,load,store,pcload,load,store,move,fp") - (set_attr "length" "*,*,*,*,4,*,*,*,*,*,4,4")]) + fsts fpul,%0 + flds %1,fpul + lds.l %1,%0 + # + #" + [(set_attr "type" "fmove,move,fmove,fmove,pcload,load,store,pcload,load,store,fmove,fmove,load,*,*") + (set_attr "length" "*,*,*,*,4,*,*,*,*,*,2,2,2,*,*")]) (define_split - [(set (match_operand:SF 0 "general_movdst_operand" "") - (match_operand:SF 1 "general_movsrc_operand" "")) - (clobber (reg:SI 0))] - "GET_CODE (operands[0]) == REG && REGNO (operands[0]) < FIRST_PSEUDO_REGISTER" - [(parallel [(set (match_dup 0) (match_dup 1)) - (clobber (scratch:SI))])] + [(set (match_operand:SF 0 "register_operand" "ry") + (match_operand:SF 1 "register_operand" "ry")) + (clobber (match_scratch:SI 2 "X"))] + "reload_completed + && true_regnum (operands[0]) < FIRST_FP_REG + && true_regnum (operands[1]) < FIRST_FP_REG" + [(set (match_dup 0) (match_dup 1))] " { - if (REGNO (operands[0]) >= FIRST_FP_REG && REGNO (operands[0]) <= LAST_FP_REG) - { - if (GET_CODE (operands[1]) != MEM) - FAIL; - emit_insn (gen_mova (XEXP (operands[1], 0))); - XEXP (operands[1], 0) = gen_rtx (REG, Pmode, 0); - } + operands[0] = gen_rtx (REG, SImode, true_regnum (operands[0])); + operands[1] = gen_rtx (REG, SImode, true_regnum (operands[1])); }") +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (match_operand:SF 1 "register_operand" "")) + (clobber (reg:SI 22))] + "" + [(parallel [(set (reg:SF 22) 
(match_dup 1)) + (clobber (scratch:SI))]) + (parallel [(set (match_dup 0) (reg:SF 22)) + (clobber (scratch:SI))])] + "") + (define_expand "movsf" [(set (match_operand:SF 0 "general_movdst_operand" "") (match_operand:SF 1 "general_movsrc_operand" ""))] @@ -1884,47 +2214,45 @@ "* return output_branch (0, insn, operands);" [(set_attr "type" "cbranch")]) -(define_insn "inverse_branch_true" - [(set (pc) (if_then_else (ne (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] +;; Patterns to prevent reorg from re-combining a condbranch with a branch +;; which destination is too far away. +;; The const_int_operand is distinct for each branch target; it avoids +;; unwanted matches with redundant_insn. +(define_insn "block_branch_redirect" + [(set (pc) (unspec [(match_operand 0 "const_int_operand" "")] 4))] "" - "* return output_branch (0, insn, operands);" - [(set_attr "type" "cbranch")]) + "" + [(set_attr "length" "0")]) -(define_insn "inverse_branch_false" - [(set (pc) (if_then_else (eq (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] +;; This one has the additional purpose to record a possible scratch register +;; for the following branch. +(define_insn "indirect_jump_scratch" + [(set (match_operand 0 "register_operand" "r") + (unspec [(match_operand 1 "const_int_operand" "")] 4))] "" - "* return output_branch (1, insn, operands);" - [(set_attr "type" "cbranch")]) + "" + [(set_attr "length" "0")]) ;; Conditional branch insns (define_expand "beq" - [(set (reg:SI 18) (eq:SI (match_dup 1) (match_dup 2))) - (set (pc) + [(set (pc) (if_then_else (ne (reg:SI 18) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] "" "from_compare (operands, EQ);") -; There is no bne compare, so we reverse the branch arms. - (define_expand "bne" - [(set (reg:SI 18) (eq:SI (match_dup 1) (match_dup 2))) - (set (pc) - (if_then_else (ne (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] + [(set (pc) + (if_then_else (eq (reg:SI 18) (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc)))] "" - "from_compare (operands, NE);") + "from_compare (operands, EQ);") (define_expand "bgt" - [(set (reg:SI 18) (gt:SI (match_dup 1) (match_dup 2))) - (set (pc) + [(set (pc) (if_then_else (ne (reg:SI 18) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] @@ -1932,11 +2260,10 @@ "from_compare (operands, GT);") (define_expand "blt" - [(set (reg:SI 18) (ge:SI (match_dup 1) (match_dup 2))) - (set (pc) - (if_then_else (ne (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] + [(set (pc) + (if_then_else (eq (reg:SI 18) (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc)))] "" " { @@ -1948,28 +2275,41 @@ emit_insn (gen_bgt (operands[0])); DONE; } - from_compare (operands, LT); + from_compare (operands, GE); }") (define_expand "ble" - [(set (reg:SI 18) (gt:SI (match_dup 1) (match_dup 2))) - (set (pc) - (if_then_else (ne (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] + [(set (pc) + (if_then_else (eq (reg:SI 18) (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc)))] "" - "from_compare (operands, LE);") + " +{ + if (TARGET_SH3E + && TARGET_IEEE + && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT) + { + rtx tmp = sh_compare_op0; + sh_compare_op0 = sh_compare_op1; + sh_compare_op1 = tmp; + emit_insn (gen_bge (operands[0])); + DONE; + } + from_compare (operands, GT); +}") (define_expand "bge" - [(set (reg:SI 18) (ge:SI (match_dup 1) (match_dup 2))) - (set (pc) + [(set (pc) (if_then_else (ne 
(reg:SI 18) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] "" " { - if (GET_MODE (sh_compare_op0) == SFmode) + if (TARGET_SH3E + && ! TARGET_IEEE + && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT) { rtx tmp = sh_compare_op0; sh_compare_op0 = sh_compare_op1; @@ -1981,8 +2321,7 @@ }") (define_expand "bgtu" - [(set (reg:SI 18) (gtu:SI (match_dup 1) (match_dup 2))) - (set (pc) + [(set (pc) (if_then_else (ne (reg:SI 18) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] @@ -1990,17 +2329,15 @@ "from_compare (operands, GTU); ") (define_expand "bltu" - [(set (reg:SI 18) (geu:SI (match_dup 1) (match_dup 2))) - (set (pc) - (if_then_else (ne (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] + [(set (pc) + (if_then_else (eq (reg:SI 18) (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc)))] "" - "from_compare (operands, LTU);") + "from_compare (operands, GEU);") (define_expand "bgeu" - [(set (reg:SI 18) (geu:SI (match_dup 1) (match_dup 2))) - (set (pc) + [(set (pc) (if_then_else (ne (reg:SI 18) (const_int 0)) (label_ref (match_operand 0 "" "")) (pc)))] @@ -2008,13 +2345,12 @@ "from_compare (operands, GEU);") (define_expand "bleu" - [(set (reg:SI 18) (gtu:SI (match_dup 1) (match_dup 2))) - (set (pc) - (if_then_else (ne (reg:SI 18) (const_int 0)) - (pc) - (label_ref (match_operand 0 "" ""))))] + [(set (pc) + (if_then_else (eq (reg:SI 18) (const_int 0)) + (label_ref (match_operand 0 "" "")) + (pc)))] "" - "from_compare (operands, LEU);") + "from_compare (operands, GTU);") ;; ------------------------------------------------------------------------ ;; Jump and linkage insns @@ -2027,7 +2363,7 @@ "* { /* The length is 16 if the delay slot is unfilled. */ - if (get_attr_length(insn) >= 14) + if (get_attr_length(insn) > 4) return output_far_jump(insn, operands[0]); else return \"bra %l0%#\"; @@ -2074,19 +2410,38 @@ (match_operand:SI 0 "arith_reg_operand" "r"))] "" "jmp @%0%#" - [(set_attr "needs_delay_slot" "yes")]) + [(set_attr "needs_delay_slot" "yes") + (set_attr "type" "jump_ind")]) -;; This might seem redundant, but it helps us distinguish case table jumps +;; The use of operand 1 / 2 helps us distinguish case table jumps ;; which can be present in structured code from indirect jumps which can not ;; be present in structured code. This allows -fprofile-arcs to work. -(define_insn "*casesi_jump" +;; For SH1 processors. +(define_insn "casesi_jump_1" [(set (pc) - (match_operand:SI 0 "arith_reg_operand" "r")) + (match_operand:SI 0 "register_operand" "r")) (use (label_ref (match_operand 1 "" "")))] "" - "jmp @%0%#" - [(set_attr "needs_delay_slot" "yes")]) + "jmp @%0%#" + [(set_attr "needs_delay_slot" "yes") + (set_attr "type" "jump_ind")]) + +;; For all later processors. +(define_insn "casesi_jump_2" + [(set (pc) (plus:SI (match_operand:SI 0 "register_operand" "r") + (match_operand 1 "braf_label_ref_operand" ""))) + (use (label_ref (match_operand 2 "" "")))] + "" + "braf %0%#" + [(set_attr "needs_delay_slot" "yes") + (set_attr "type" "jump_ind")]) + +(define_insn "dummy_jump" + [(set (pc) (const_int 0))] + "" + "" + [(set_attr "length" "0")]) ;; Call subroutine returning any type. ;; ??? This probably doesn't work. @@ -2127,22 +2482,26 @@ (eq:SI (match_operand:SI 0 "arith_reg_operand" "+r") (const_int 1))) (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))] "TARGET_SH2" - "dt %0") + "dt %0" + [(set_attr "type" "arith")]) (define_insn "nop" [(const_int 0)] "" "nop") -;; Load address of a label. This is only generated by the casesi expand. 
-;; This must use unspec, because this only works immediately before a casesi. +;; Load address of a label. This is only generated by the casesi expand, +;; and by machine_dependent_reorg (fixing up fp moves). +;; This must use unspec, because this only works for labels that are +;; within range, (define_insn "mova" [(set (reg:SI 0) (unspec [(label_ref (match_operand 0 "" ""))] 1))] "" "mova %O0,r0" - [(set_attr "in_delay_slot" "no")]) + [(set_attr "in_delay_slot" "no") + (set_attr "type" "arith")]) ;; case instruction for switch statements. @@ -2152,56 +2511,157 @@ ;; operand 3 is CODE_LABEL for the table; ;; operand 4 is the CODE_LABEL to go to if index out of range. -;; ??? There should be a barrier after the jump at the end. - (define_expand "casesi" - [(set (match_dup 5) (match_operand:SI 0 "arith_reg_operand" "")) - (set (match_dup 5) (minus:SI (match_dup 5) + [(match_operand:SI 0 "arith_reg_operand" "") + (match_operand:SI 1 "arith_reg_operand" "") + (match_operand:SI 2 "arith_reg_operand" "") + (match_operand 3 "" "") (match_operand 4 "" "")] + "" + " +{ + rtx reg = gen_reg_rtx (SImode); + rtx reg2 = gen_reg_rtx (SImode); + operands[1] = copy_to_mode_reg (SImode, operands[1]); + operands[2] = copy_to_mode_reg (SImode, operands[2]); + /* If optimizing, casesi_worker depends on the mode of the instruction + before label it 'uses' - operands[3]. */ + emit_insn (gen_casesi_0 (operands[0], operands[1], operands[2], operands[4], + reg)); + emit_insn (gen_casesi_worker_0 (reg2, reg, operands[3])); + if (TARGET_SH2) + { + rtx lab = gen_label_rtx (); + emit_jump_insn (gen_casesi_jump_2 (reg2, + gen_rtx (LABEL_REF, VOIDmode, lab), + operands[3])); + emit_label (lab); + /* Put a fake jump after the label, lest some optimization might + delete the barrier and LAB. */ + emit_jump_insn (gen_dummy_jump ()); + } + else + { + emit_jump_insn (gen_casesi_jump_1 (reg2, operands[3])); + } + /* For SH2 and newer, the ADDR_DIFF_VEC is not actually relative to + operands[3], but to lab. We will fix this up in + machine_dependent_reorg. */ + emit_barrier (); + DONE; +}") + +(define_expand "casesi_0" + [(set (match_operand:SI 4 "" "") (match_operand:SI 0 "arith_reg_operand" "")) + (set (match_dup 4) (minus:SI (match_dup 4) (match_operand:SI 1 "arith_operand" ""))) (set (reg:SI 18) - (gtu:SI (match_dup 5) + (gtu:SI (match_dup 4) (match_operand:SI 2 "arith_reg_operand" ""))) (set (pc) (if_then_else (ne (reg:SI 18) (const_int 0)) - (label_ref (match_operand 4 "" "")) - (pc))) - (set (match_dup 6) (match_dup 5)) - (set (match_dup 6) (ashift:SI (match_dup 6) (match_dup 7))) - (set (reg:SI 0) (unspec [(label_ref (match_operand 3 "" ""))] 1)) - (parallel [(set (reg:SI 0) (plus:SI (reg:SI 0) - (mem:HI (plus:SI (reg:SI 0) - (match_dup 6))))) - (set (match_dup 6) (mem:HI (plus:SI (reg:SI 0) (match_dup 6))))]) - (parallel [(set (pc) (reg:SI 0)) - (use (label_ref (match_dup 3)))])] + (label_ref (match_operand 3 "" "")) + (pc)))] "" - " -{ - operands[1] = copy_to_mode_reg (SImode, operands[1]); - operands[2] = copy_to_mode_reg (SImode, operands[2]); - operands[5] = gen_reg_rtx (SImode); - operands[6] = gen_reg_rtx (SImode); - operands[7] = GEN_INT (TARGET_BIGTABLE ? 2 : 1); -}") + "") -(define_insn "casesi_worker" - [(set (reg:SI 0) - (plus:SI (reg:SI 0) - (mem:HI (plus:SI (reg:SI 0) - (match_operand:SI 0 "arith_reg_operand" "+r"))))) - (set (match_dup 0) (mem:HI (plus:SI (reg:SI 0) - (match_dup 0))))] +;; ??? 
reload might clobber r0 if we use it explicitly in the RTL before +;; reload; using a R0_REGS pseudo reg is likely to give poor code. +;; So we keep the use of r0 hidden in a R0_REGS clobber until after reload. + +(define_insn "casesi_worker_0" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (unspec [(match_operand 1 "register_operand" "0,r") + (label_ref (match_operand 2 "" ""))] 2)) + (clobber (match_scratch:SI 3 "=X,1")) + (clobber (match_scratch:SI 4 "=&z,z"))] + "" + "#") + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (unspec [(match_operand 1 "register_operand" "") + (label_ref (match_operand 2 "" ""))] 2)) + (clobber (match_scratch:SI 3 "")) + (clobber (match_scratch:SI 4 ""))] + "! TARGET_SH2 && reload_completed" + [(set (reg:SI 0) (unspec [(label_ref (match_dup 2))] 1)) + (parallel [(set (match_dup 0) + (unspec [(reg:SI 0) (match_dup 1) (label_ref (match_dup 2))] 2)) + (clobber (match_dup 3))]) + (set (match_dup 0) (plus:SI (match_dup 0) (reg:SI 0)))] + "LABEL_NUSES (operands[2])++;") + +(define_split + [(set (match_operand:SI 0 "register_operand" "") + (unspec [(match_operand 1 "register_operand" "") + (label_ref (match_operand 2 "" ""))] 2)) + (clobber (match_scratch:SI 3 "")) + (clobber (match_scratch:SI 4 ""))] + "TARGET_SH2 && reload_completed" + [(set (reg:SI 0) (unspec [(label_ref (match_dup 2))] 1)) + (parallel [(set (match_dup 0) + (unspec [(reg:SI 0) (match_dup 1) (label_ref (match_dup 2))] 2)) + (clobber (match_dup 3))])] + "LABEL_NUSES (operands[2])++;") + +(define_insn "*casesi_worker" + [(set (match_operand:SI 0 "register_operand" "=r,r") + (unspec [(reg:SI 0) (match_operand 1 "register_operand" "0,r") + (label_ref (match_operand 2 "" ""))] 2)) + (clobber (match_scratch:SI 3 "=X,1"))] "" "* { - if (TARGET_BIGTABLE) - return \"mov.l @(r0,%0),%0\;add %0,r0\"; - else - return \"mov.w @(r0,%0),%0\;add %0,r0\"; + enum machine_mode mode + = optimize + ? GET_MODE (PATTERN (prev_real_insn (operands[2]))) + : sh_addr_diff_vec_mode; + switch (mode) + { + case SImode: + return \"shll2 %1\;mov.l @(r0,%1),%0\"; + case HImode: + return \"add %1,%1\;mov.w @(r0,%1),%0\"; + case QImode: + { + rtx adj = PATTERN (prev_real_insn (operands[2])); + if ((insn_addresses[INSN_UID (XEXP ( XVECEXP (adj, 0, 1), 0))] + - insn_addresses[INSN_UID (XEXP (XVECEXP (adj, 0, 2), 0))]) + <= 126) + return \"mov.b @(r0,%1),%0\"; + return \"mov.b @(r0,%1),%0\;extu.b %0,%0\"; + } + default: + abort (); + } }" [(set_attr "length" "4")]) +;; Include ADDR_DIFF_VECS in the shorten_branches pass; we have to +;; use a negative-length instruction to actually accomplish this. +(define_insn "addr_diff_vec_adjust" + [(unspec_volatile [(label_ref (match_operand 0 "" "")) + (label_ref (match_operand 1 "" "")) + (label_ref (match_operand 2 "" "")) + (match_operand 3 "const_int_operand" "")] 7)] + "" + "* +{ + /* ??? ASM_OUTPUT_ADDR_DIFF_ELT gets passed no context information, so + we must use a kludge with a global variable. */ + sh_addr_diff_vec_mode = GET_MODE (PATTERN (insn)); + return \"\"; +}" +;; Need a variable length for this to be processed in each shorten_branch pass. +;; The actual work is done in ADJUST_INSN_LENTH, because length attributes +;; need to be (a choice of) constants. +;; We use the calculated length before ADJUST_INSN_LENGTH to +;; determine if the insn_addresses array contents are valid. 
+  [(set (attr "length")
+	(if_then_else (eq (pc) (const_int -1))
+		      (const_int 2) (const_int 0)))])
+
 (define_insn "return"
   [(return)]
   "reload_completed"
@@ -2233,7 +2693,8 @@
   [(set (match_operand:SI 0 "arith_reg_operand" "=r")
	(eq:SI (reg:SI 18) (const_int 1)))]
   ""
-  "movt %0")
+  "movt %0"
+  [(set_attr "type" "arith")])
 
 (define_expand "seq"
   [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -2248,18 +2709,15 @@
   "operands[1] = prepare_scc_operands (LT);")
 
 (define_expand "sle"
-  [(set (match_operand:SI 0 "arith_reg_operand" "")
-	(match_dup 1))]
+  [(match_operand:SI 0 "arith_reg_operand" "")]
   ""
   "
 {
-  if (GET_MODE (sh_compare_op0) == SFmode)
-    {
-      emit_insn (gen_sgt (operands[0]));
-      emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
-      DONE;
-    }
-  operands[1] = prepare_scc_operands (LE);
+  rtx tmp = sh_compare_op0;
+  sh_compare_op0 = sh_compare_op1;
+  sh_compare_op1 = tmp;
+  emit_insn (gen_sge (operands[0]));
+  DONE;
 }")
 
 (define_expand "sgt"
@@ -2276,8 +2734,22 @@
 {
   if (GET_MODE (sh_compare_op0) == SFmode)
     {
-      emit_insn (gen_slt (operands[0]));
-      emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
+      if (TARGET_IEEE)
+	{
+	  rtx t_reg = gen_rtx (REG, SImode, T_REG);
+	  rtx lab = gen_label_rtx ();
+	  emit_insn (gen_rtx (SET, VOIDmode, t_reg,
+			      gen_rtx (EQ, SImode, sh_compare_op0,
+				       sh_compare_op1)));
+	  emit_jump_insn (gen_branch_true (lab));
+	  emit_insn (gen_rtx (SET, VOIDmode, t_reg,
+			      gen_rtx (GT, SImode, sh_compare_op0,
+				       sh_compare_op1)));
+	  emit_label (lab);
+	  emit_insn (gen_movt (operands[0]));
+	}
+      else
+	emit_insn (gen_movnegt (operands[0], prepare_scc_operands (LT)));
       DONE;
     }
   operands[1] = prepare_scc_operands (GE);
@@ -2330,6 +2802,18 @@
   operands[2] = gen_reg_rtx (SImode);
 }")
+;; Use the same trick for FP sle / sge
+(define_expand "movnegt"
+  [(set (match_dup 2) (const_int -1))
+   (parallel [(set (match_operand 0 "" "")
+		   (neg:SI (plus:SI (match_dup 1)
+				    (match_dup 2))))
+	      (set (reg:SI 18)
+		   (ne:SI (ior:SI (match_operand 1 "" "") (match_dup 2))
+			  (const_int 0)))])]
+  ""
+  "operands[2] = gen_reg_rtx (SImode);")
+
 ;; Recognize mov #-1/negc/neg sequence, and change it to movt/add #-1.
 ;; This prevents a regression that occured when we switched from xor to
 ;; mov/neg for sne.
@@ -2416,22 +2900,42 @@
   [(set_attr "length" "8")
    (set_attr "in_delay_slot" "no")])
 
+;; Alignment is needed for some constant tables; it may also be added for
+;; instructions at the start of loops, or after unconditional branches.
+;; ??? We would get more accurate lengths if we did instruction
+;; alignment based on the value of INSN_CURRENT_ADDRESS; the approach used
+;; here is too conservative.
+
 ; align to a two byte boundary
 
 (define_insn "align_2"
-  [(unspec_volatile [(const_int 0)] 10)]
+  [(unspec_volatile [(const_int 1)] 1)]
   ""
   ".align 1"
   [(set_attr "length" "0")
    (set_attr "in_delay_slot" "no")])
 
 ; align to a four byte boundary
+;; align_4 and align_log are instructions for the starts of loops, or
+;; after unconditional branches, which may take up extra room.
 
-(define_insn "align_4"
-  [(unspec_volatile [(const_int 0)] 5)]
+(define_expand "align_4"
+  [(unspec_volatile [(const_int 2)] 1)]
   ""
-  ".align 2"
-  [(set_attr "in_delay_slot" "no")])
+  "")
+
+; align to a cache line boundary
+
+(define_insn "align_log"
+  [(unspec_volatile [(match_operand 0 "const_int_operand" "")] 1)]
+  ""
+  ".align %O0"
+;; Need a variable length for this to be processed in each shorten_branches pass.
+;; The actual work is done in ADJUST_INSN_LENGTH, because length attributes
+;; need to be (a choice of) constants. 
+ [(set (attr "length") + (if_then_else (ne (pc) (pc)) (const_int 2) (const_int 0))) + (set_attr "in_delay_slot" "no")]) ; emitted at the end of the literal table, used to emit the ; 32bit branch labels if needed. @@ -2508,7 +3012,7 @@ (define_insn "subsf3" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (minus:SF (match_operand:SF 1 "arith_reg_operand" "0") - (match_operand:SF 2 "arith_reg_operand" "f")))] + (match_operand:SF 2 "arith_reg_operand" "f")))] "TARGET_SH3E" "fsub %2,%0" [(set_attr "type" "fp")]) @@ -2533,81 +3037,64 @@ (define_insn "divsf3" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (div:SF (match_operand:SF 1 "arith_reg_operand" "0") - (match_operand:SF 2 "arith_reg_operand" "f")))] + (match_operand:SF 2 "arith_reg_operand" "f")))] "TARGET_SH3E" "fdiv %2,%0" [(set_attr "type" "fdiv")]) -;; ??? This is the right solution, but it fails because the movs[if] patterns -;; silently clobber FPUL (r22) for int<->fp moves. Thus we can not explicitly -;; use FPUL here. -;; -;;(define_expand "floatsisf2" -;; [(set (reg:SI 22) -;; (match_operand:SI 1 "arith_reg_operand" "")) -;; (set (match_operand:SF 0 "arith_reg_operand" "") -;; (float:SF (reg:SI 22)))] -;; "TARGET_SH3E" -;; "") -;; -;;(define_insn "*floatsisf" -;; [(set (match_operand:SF 0 "arith_reg_operand" "=f") -;; (float:SF (reg:SI 22)))] -;; "TARGET_SH3E" -;; "float fpul,%0") +(define_expand "floatsisf2" + [(set (reg:SI 22) + (match_operand:SI 1 "arith_reg_operand" "")) + (set (match_operand:SF 0 "arith_reg_operand" "") + (float:SF (reg:SI 22)))] + "TARGET_SH3E" + "") -(define_insn "floatsisf2" +(define_insn "*floatsisf2_ie" [(set (match_operand:SF 0 "arith_reg_operand" "=f") - (float:SF (match_operand:SI 1 "arith_reg_operand" "r")))] + (float:SF (reg:SI 22)))] "TARGET_SH3E" - "lds %1,fpul\;float fpul,%0" - [(set_attr "length" "4") - (set_attr "type" "fp")]) + "float fpul,%0" + [(set_attr "type" "fp")]) -;; ??? This is the right solution, but it fails because the movs[if] patterns -;; silently clobber FPUL (r22) for int<->fp moves. Thus we can not explicitly -;; use FPUL here. -;; -;;(define_expand "fix_truncsfsi2" -;; [(set (reg:SI 22) -;; (fix:SI (match_operand:SF 1 "arith_reg_operand" "f"))) -;; (set (match_operand:SI 0 "arith_reg_operand" "=r") -;; (reg:SI 22))] -;; "TARGET_SH3E" -;; "") -;; -;;(define_insn "*fixsfsi" -;; [(set (reg:SI 22) -;; (fix:SI (match_operand:SF 0 "arith_reg_operand" "f")))] -;; "TARGET_SH3E" -;; "ftrc %0,fpul") - -(define_insn "fix_truncsfsi2" - [(set (match_operand:SI 0 "arith_reg_operand" "=r") - (fix:SI (match_operand:SF 1 "arith_reg_operand" "f")))] +(define_expand "fix_truncsfsi2" + [(set (reg:SI 22) + (fix:SI (match_operand:SF 1 "arith_reg_operand" "f"))) + (set (match_operand:SI 0 "arith_reg_operand" "=r") + (reg:SI 22))] "TARGET_SH3E" - "ftrc %1,fpul\;sts fpul,%0" - [(set_attr "length" "4") - (set_attr "type" "move")]) + "") -;; ??? This should be SFmode not SImode in the compare, but that would -;; require fixing the branch patterns too. -(define_insn "*cmpgtsf_t" +(define_insn "*fixsfsi" + [(set (reg:SI 22) + (fix:SI (match_operand:SF 0 "arith_reg_operand" "f")))] + "TARGET_SH3E" + "ftrc %0,fpul" + [(set_attr "type" "fp")]) + +(define_insn "cmpgtsf_t" [(set (reg:SI 18) (gt:SI (match_operand:SF 0 "arith_reg_operand" "f") (match_operand:SF 1 "arith_reg_operand" "f")))] "TARGET_SH3E" "fcmp/gt %1,%0" [(set_attr "type" "fp")]) -;; ??? This should be SFmode not SImode in the compare, but that would -;; require fixing the branch patterns too. 
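The TARGET_IEEE path in the sge expander above computes (a == b) and, if false, (a > b), rather than negating (a < b) with movnegt, because the two forms agree only when no NaN is involved: under IEEE 754 every ordered comparison against a NaN is false, so !(a < b) would wrongly report a >= NaN as true. The movnegt pattern itself just inverts the T bit arithmetically: with operand 2 forced to -1 it computes -(T + (-1)), i.e. 1 - T for T in {0,1}. A minimal host-side C sketch, for illustration only (the helper names are made up, and this is not code from the patch), shows the difference the IEEE path preserves:

#include <math.h>
#include <stdio.h>

/* Mirrors the TARGET_IEEE expansion: T = (a == b); if T is clear, T = (a > b).  */
static int ge_ieee (float a, float b) { return a == b || a > b; }

/* Mirrors the non-IEEE shortcut: movnegt applied to (a < b), i.e. !(a < b).  */
static int ge_negt (float a, float b) { return !(a < b); }

int main (void)
{
  float x = 1.0f, n = nanf ("");
  /* Prints "1 1" for ordinary operands, but "0 1" when a NaN is involved:
     only the first form matches IEEE semantics for >=.  */
  printf ("%d %d\n", ge_ieee (x, x), ge_negt (x, x));
  printf ("%d %d\n", ge_ieee (x, n), ge_negt (x, n));
  return 0;
}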
-(define_insn "*cmpeqsf_t" +(define_insn "cmpeqsf_t" [(set (reg:SI 18) (eq:SI (match_operand:SF 0 "arith_reg_operand" "f") (match_operand:SF 1 "arith_reg_operand" "f")))] "TARGET_SH3E" "fcmp/eq %1,%0" [(set_attr "type" "fp")]) +(define_insn "ieee_ccmpeqsf_t" + [(set (reg:SI 18) (ior:SI (reg:SI 18) + (eq:SI (match_operand:SF 0 "arith_reg_operand" "f") + (match_operand:SF 1 "arith_reg_operand" "f"))))] + "TARGET_SH3E && TARGET_IEEE" + "* return output_ieee_ccmpeq (insn, operands);" + [(set_attr "length" "4")]) + + (define_expand "cmpsf" [(set (reg:SI 18) (compare (match_operand:SF 0 "arith_operand" "") (match_operand:SF 1 "arith_operand" "")))] @@ -2813,7 +3300,7 @@ || (GET_CODE (operands[2]) == SUBREG && REGNO (SUBREG_REG (operands[2])) >= FIRST_FP_REG)) && reg_unused_after (operands[0], insn)" - "fmov.s %2,@(%0,%1)") + "fmov{.s|} %2,@(%0,%1)") (define_peephole [(set (match_operand:SI 0 "register_operand" "=r") @@ -2826,7 +3313,7 @@ || (GET_CODE (operands[2]) == SUBREG && REGNO (SUBREG_REG (operands[2])) >= FIRST_FP_REG)) && reg_unused_after (operands[0], insn)" - "fmov.s @(%0,%1),%2") + "fmov{.s|} @(%0,%1),%2") ;; Switch to a new stack with its address in sp_switch (a SYMBOL_REF). */ (define_insn "sp_switch_1" diff --git a/gcc/ginclude/va-sh.h b/gcc/ginclude/va-sh.h index 4bae1e0dcd2..f1671c7b0b6 100644 --- a/gcc/ginclude/va-sh.h +++ b/gcc/ginclude/va-sh.h @@ -23,9 +23,6 @@ typedef struct { typedef void *__gnuc_va_list; -#define __va_rounded_size(TYPE) \ - (((sizeof (TYPE) + sizeof (int) - 1) / sizeof (int)) * sizeof (int)) - #endif /* ! SH3E */ #endif /* __GNUC_VA_LIST */ @@ -116,104 +113,83 @@ enum __va_type_classes { #endif #define va_end(pvar) ((void)0) +#ifdef __LITTLE_ENDIAN__ +#define __LITTLE_ENDIAN_P 1 +#else +#define __LITTLE_ENDIAN_P 0 +#endif + +#define __SCALAR_TYPE(TYPE) \ + ((TYPE) == __integer_type_class \ + || (TYPE) == __char_type_class \ + || (TYPE) == __enumeral_type_class) + /* RECORD_TYPE args passed using the C calling convention are passed by invisible reference. ??? RECORD_TYPE args passed in the stack are made to be word-aligned; for an aggregate that is not word-aligned, we advance the pointer to the first non-reg slot. */ + /* When this is a smaller-than-int integer, using + auto-increment in the promoted (SImode) is fastest; + however, there is no way to express that is C. Therefore, + we use an asm. + We want the MEM_IN_STRUCT_P bit set in the emitted RTL, therefore we + use unions even when it would otherwise be unnecessary. */ + +#define __va_arg_sh1(AP, TYPE) __extension__ \ +__extension__ \ +({(sizeof (TYPE) == 1 \ + ? ({union {TYPE t; char c;} __t; \ + asm("" \ + : "=r" (__t.c) \ + : "0" ((((union { int i, j; } *) (AP))++)->i)); \ + __t.t;}) \ + : sizeof (TYPE) == 2 \ + ? ({union {TYPE t; short s;} __t; \ + asm("" \ + : "=r" (__t.s) \ + : "0" ((((union { int i, j; } *) (AP))++)->i)); \ + __t.t;}) \ + : sizeof (TYPE) >= 4 || __LITTLE_ENDIAN_P \ + ? (((union { TYPE t; int i;} *) (AP))++)->t \ + : ((union {TYPE t;TYPE u;}*) ((char *)++(int *)(AP) - sizeof (TYPE)))->t);}) + #ifdef __SH3E__ -#ifdef __LITTLE_ENDIAN__ +#define __PASS_AS_FLOAT(TYPE_CLASS,SIZE) \ + (TYPE_CLASS == __real_type_class && SIZE == 4) #define va_arg(pvar,TYPE) \ __extension__ \ -(*({int __type = __builtin_classify_type (* (TYPE *) 0); \ - void * __result; \ - if (__type == __real_type_class && sizeof(TYPE) == 4) \ - /* float? 
*/ \ +({int __type = __builtin_classify_type (* (TYPE *) 0); \ + void * __result_p; \ + if (__PASS_AS_FLOAT (__type, sizeof(TYPE))) \ { \ - __va_freg *__r; \ if (pvar.__va_next_fp < pvar.__va_next_fp_limit) \ - __r = (__va_freg *) pvar.__va_next_fp++; \ + { \ + __result_p = &pvar.__va_next_fp; \ + } \ else \ - __r = (__va_freg *) pvar.__va_next_stack++; \ - __result = (char *) __r; \ + __result_p = &pvar.__va_next_stack; \ } \ else \ { \ - __va_greg *_r; \ if (pvar.__va_next_o + ((sizeof (TYPE) + 3) / 4) \ <= pvar.__va_next_o_limit) \ - { \ - _r = pvar.__va_next_o; \ - pvar.__va_next_o += (sizeof (TYPE) + 3) / 4; \ - } \ + __result_p = &pvar.__va_next_o; \ else \ { \ - _r = pvar.__va_next_stack; \ - pvar.__va_next_stack += (sizeof (TYPE) + 3) / 4; \ + if (sizeof (TYPE) > 4) \ + pvar.__va_next_o = pvar.__va_next_o_limit; \ + \ + __result_p = &pvar.__va_next_stack; \ } \ - __result = (char *) _r; \ } \ - (TYPE *) __result;})) - -#else /* ! __LITTLE_ENDIAN__ */ - -#define va_arg(pvar,TYPE) \ -__extension__ \ -(*({int __type = __builtin_classify_type (* (TYPE *) 0); \ - void * __result; \ - if (__type == __real_type_class && sizeof(TYPE) == 4) \ - /* float? */ \ - { \ - __va_freg *__r; \ - if (pvar.__va_next_fp < pvar.__va_next_fp_limit) \ - __r = (__va_freg *) pvar.__va_next_fp++; \ - else \ - __r = (__va_freg *) pvar.__va_next_stack++; \ - __result = (char *) __r; \ - } \ - else \ - { \ - __va_greg *_r; \ - if (pvar.__va_next_o + ((sizeof (TYPE) + 3) / 4) \ - <= pvar.__va_next_o_limit) \ - { \ - pvar.__va_next_o += (sizeof (TYPE) + 3) / 4; \ - _r = pvar.__va_next_o; \ - } \ - else \ - { \ - pvar.__va_next_stack += (sizeof (TYPE) + 3) / 4; \ - _r = pvar.__va_next_stack; \ - } \ - __result = ((char *) _r \ - - (sizeof (TYPE) < 4 ? sizeof (TYPE) \ - : ((sizeof (TYPE) + 3) / 4) * 4)); \ - } \ - (TYPE *) __result;})) - -#endif /* __LITTLE_ENDIAN__ */ + __va_arg_sh1(*(void **)__result_p, TYPE);}) #else /* ! SH3E */ -#ifdef __LITTLE_ENDIAN__ - -/* This is for little-endian machines; small args are padded upward. */ -#define va_arg(AP, TYPE) \ - (AP = (__gnuc_va_list) ((char *) (AP) + __va_rounded_size (TYPE)), \ - *((TYPE *) (void *) ((char *) (AP) - __va_rounded_size (TYPE)))) - -#else /* ! __LITTLE_ENDIAN__ */ - -/* This is for big-endian machines; small args are padded downward. */ -#define va_arg(AP, TYPE) \ - (AP = (__gnuc_va_list) ((char *) (AP) + __va_rounded_size (TYPE)), \ - *((TYPE *) (void *) ((char *) (AP) \ - - ((sizeof (TYPE) < __va_rounded_size (char) \ - ? sizeof (TYPE) : __va_rounded_size (TYPE)))))) - -#endif /* __LITTLE_ENDIAN__ */ +#define va_arg(AP, TYPE) __va_arg_sh1((AP), TYPE) #endif /* SH3E */
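For context on the new __va_arg_sh1 logic, the following is a minimal, hypothetical test program (not part of this patch) exercising the rule the comment above describes: arguments narrower than int reach a variadic function already promoted to SImode, so each occupies a full int-sized slot in the general-register save area (__va_next_o up to __va_next_o_limit) or on the stack (__va_next_stack), and va_arg must name the promoted type.

#include <stdarg.h>
#include <stdio.h>

/* Sums its variadic arguments; chars and shorts must be read back as int,
   which is exactly the int-sized-slot access __va_arg_sh1 performs for
   sizeof (TYPE) == 1 and sizeof (TYPE) == 2.  */
static int sum (int count, ...)
{
  va_list ap;
  int total = 0;
  va_start (ap, count);
  while (count-- > 0)
    total += va_arg (ap, int);
  va_end (ap);
  return total;
}

int main (void)
{
  char c = 3;
  short s = 4;
  /* c and s are promoted to int at the call site; prints 12.  */
  printf ("%d\n", sum (3, c, s, 5));
  return 0;
}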