Scheduling revamp:

* sh.md (attribute type): Add types mt_group, fload, pcfload, fpul_gp,
	mac_gp, ftrc_s and cwb.  Add / Adjust definitions in individual insns
	accordingly.
	(attribute insn_class): Provide default definitions based on type.
	Remove all insn-specific settings.
	(various function units): Remove old SH4 scheduling.
	(branch_zero, dfp_comp, late_fp_use, any_fp_comp, any_int_load):
	New attributes.  Set them where appropriate.
	(cpu unit FS): Don't define / use.
	(F3, load_store): New cpu units.
	(F01): New reservation.
	(all insn_reservations): Make dependent on sh4 pipeline model.
	Fix latencies.
	(nil, reg_mov, freg_mov, sh4_fpul_gp, sh4_call): New insn_reservations.
	(sh4_mac_gp, fp_arith_ftrc, arith3, arith3b): Likewise.
	(mt insn_reservation): Use type mt_group.
	(insn_reservation load_store): Split into sh4_load, sh4_load_si,
	sh4_fload and sh4_store.
	(insn_reservation branch_zero and branch): Replace with sh4_branch.
	(insn_reservation branch_far): Replace with sh4_return.
	(insn_reservation return_from_exp): Rename to:
	(sh4_return_from_exp).  Change to be just d_lock*5.
	(insn_reservation lds_to_pr): Rename to:
	(sh4_lds_to_pr).  Change to be just d_lock*2.
	(insn_reservation ldsmem_to_pr, sts_from_pr): Change to be just
	d_lock*2.
	(insn_reservation prload_mem): Rename to:
	(sh4_prstore_mem).  Change to d_lock*2,nothing,memory.
	(insn_reservation fpscr_store): Rename to:
	(fpscr_load).  Change to d_lock,nothing,F1*3.
	(insn_reservation fpscr_store_mem): Rename to:
	(fpscr_load_mem).  Change to d_lock,nothing,(F1+memory),F1*2.
	(insn_reservation multi): Change to
	d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2.
	(insn_reservation fp_arith): Change to issue,F01,F2.
	(insn_reservation fp_div): Change to issue,F01+F3,F2+F3,F3*7,F1+F3,F2.
	(insn_reservation dp_float): Change to issue,F01,F1+F2,F2.
	(insn_reservation fp_double_arith): Change to issue,F01,F1+F2,fpu*4,F2.
	(insn_reservation fp_double_cmp): Change to
	d_lock,(d_lock+F01),F1+F2,F2.
	(insn_reservation dp_div): Change to
	issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2.
	* sh.c (flow_dependent_p, flow_dependent_p_1): New functions.
	(sh_adjust_cost, SHcompact): Differentiate between different
	kinds of dependencies.  Drop factor of ten for superscalar.
	Use new instruction types.  Add new exception rules.
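The reservation strings quoted in the entries above (for example "issue,F01,F2" for fp_arith) are the arguments of define_insn_reservation constructs in sh.md, built on the new F3 / load_store cpu units and the F01 reservation.  Since the sh.md diff itself is suppressed further below, here is a minimal illustrative sketch of the shape such constructs take; the pipeline-model condition, the latency value and the definition of F01 are assumptions made for the example, not text taken from the patch:

;; Illustrative sketch only -- not the committed sh.md text.
;; F01 is assumed to mean "either half of the F1 stage", by analogy with
;; the (f1_1|f1_2) term quoted for the multi reservation above.
(define_reservation "F01" "f1_1|f1_2")

;; A single-precision FP arithmetic reservation using the "issue,F01,F2"
;; string from the entry above; the sh4 pipeline-model test and the
;; 3-cycle latency are assumed here.
(define_insn_reservation "fp_arith" 3
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "fp"))
  "issue,F01,F2")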

Two small bug fixes:
	* sh.md (mulhisi3, umulhisi3): Add a REG_EQUAL note.

	* sh.md (mperm_w): Add DONE.
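About the two fixes above: in a define_expand, the C preparation code must invoke DONE once it has emitted the needed insns itself, otherwise the expander falls through and also emits its RTL template; that is what the mperm_w entry refers to.  The REG_EQUAL note added for mulhisi3 / umulhisi3 likewise lives in expander C code (typically attached with set_unique_reg_note, though the exact call used by this patch is not shown here).  A hypothetical expander, not the actual mperm_w pattern, showing the DONE idiom:

;; Hypothetical pattern name and operands, for illustration only.
(define_expand "example_move"
  [(set (match_operand:SI 0 "register_operand" "")
	(match_operand:SI 1 "register_operand" ""))]
  ""
  "
{
  emit_move_insn (operands[0], operands[1]);
  DONE;	/* Insns already emitted; do not emit the template above.  */
}")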

From-SVN: r56601
Jörn Rennecke 2002-08-27 15:31:02 +00:00 committed by Joern Rennecke
parent f34fc46e79
commit c49439f112
3 changed files with 541 additions and 406 deletions

sh.c

@@ -208,6 +208,8 @@ static const char *sh_strip_name_encoding PARAMS ((const char *));
static void sh_init_builtins PARAMS ((void));
static void sh_media_init_builtins PARAMS ((void));
static rtx sh_expand_builtin PARAMS ((tree, rtx, rtx, enum machine_mode, int));
static int flow_dependent_p PARAMS ((rtx, rtx));
static void flow_dependent_p_1 PARAMS ((rtx, rtx, void *));
/* Initialize the GCC target structure. */
@@ -6994,7 +6996,7 @@ sh_adjust_cost (insn, link, dep_insn, cost)
rtx dep_insn;
int cost;
{
rtx reg;
rtx reg, use_pat;
if (TARGET_SHMEDIA)
{
@@ -7007,49 +7009,119 @@ sh_adjust_cost (insn, link, dep_insn, cost)
&& get_attr_is_mac_media (dep_insn))
cost = 1;
}
else if (GET_CODE(insn) == CALL_INSN)
else if (REG_NOTE_KIND (link) == 0)
{
enum attr_type dep_type, type;
if (recog_memoized (insn) < 0
|| recog_memoized (dep_insn) < 0)
return cost;
dep_type = get_attr_type (dep_insn);
if (dep_type == TYPE_FLOAD || dep_type == TYPE_PCFLOAD)
cost--;
if ((dep_type == TYPE_LOAD_SI || dep_type == TYPE_PCLOAD_SI)
&& (type = get_attr_type (insn)) != TYPE_CALL
&& type != TYPE_SFUNC)
cost--;
/* The only input for a call that is timing-critical is the
function's address. */
rtx call = PATTERN (insn);
if (GET_CODE(insn) == CALL_INSN)
{
rtx call = PATTERN (insn);
if (GET_CODE (call) == PARALLEL)
call = XVECEXP (call, 0 ,0);
if (GET_CODE (call) == SET)
call = SET_SRC (call);
if (GET_CODE (call) == CALL && GET_CODE (XEXP (call, 0)) == MEM
&& ! reg_set_p (XEXP (XEXP (call, 0), 0), dep_insn))
cost = 0;
}
/* All sfunc calls are parallels with at least four components.
Exploit this to avoid unnecessary calls to sfunc_uses_reg. */
else if (GET_CODE (PATTERN (insn)) == PARALLEL
&& XVECLEN (PATTERN (insn), 0) >= 4
&& (reg = sfunc_uses_reg (insn)))
{
if (GET_CODE (call) == PARALLEL)
call = XVECEXP (call, 0 ,0);
if (GET_CODE (call) == SET)
call = SET_SRC (call);
if (GET_CODE (call) == CALL && GET_CODE (XEXP (call, 0)) == MEM
&& ! reg_set_p (XEXP (XEXP (call, 0), 0), dep_insn))
cost = 0;
}
/* Likewise, the most timing critical input for an sfuncs call
is the function address. However, sfuncs typically start
using their arguments pretty quickly.
Assume a four cycle delay before they are needed. */
if (! reg_set_p (reg, dep_insn))
cost -= TARGET_SUPERSCALAR ? 40 : 4;
/* All sfunc calls are parallels with at least four components.
Exploit this to avoid unnecessary calls to sfunc_uses_reg. */
else if (GET_CODE (PATTERN (insn)) == PARALLEL
&& XVECLEN (PATTERN (insn), 0) >= 4
&& (reg = sfunc_uses_reg (insn)))
{
if (! reg_set_p (reg, dep_insn))
cost -= 4;
}
/* When the preceding instruction loads the shift amount of
the following SHAD/SHLD, the latency of the load is increased
by 1 cycle. */
else if (TARGET_SH4
&& get_attr_type (insn) == TYPE_DYN_SHIFT
&& get_attr_any_int_load (dep_insn) == ANY_INT_LOAD_YES
&& reg_overlap_mentioned_p (SET_DEST (PATTERN (dep_insn)),
XEXP (SET_SRC (single_set(insn)),
1)))
cost++;
/* When an LS group instruction with a latency of less than
3 cycles is followed by a double-precision floating-point
instruction, FIPR, or FTRV, the latency of the first
instruction is increased to 3 cycles. */
else if (cost < 3
&& get_attr_insn_class (dep_insn) == INSN_CLASS_LS_GROUP
&& get_attr_dfp_comp (insn) == DFP_COMP_YES)
cost = 3;
/* The lsw register of a double-precision computation is ready one
cycle earlier. */
else if (reload_completed
&& get_attr_dfp_comp (dep_insn) == DFP_COMP_YES
&& (use_pat = single_set (insn))
&& ! regno_use_in (REGNO (SET_DEST (single_set (dep_insn))),
SET_SRC (use_pat)))
cost -= 1;
if (get_attr_any_fp_comp (dep_insn) == ANY_FP_COMP_YES
&& get_attr_late_fp_use (insn) == LATE_FP_USE_YES)
cost -= 1;
}
/* Adjust load_si / pcload_si type insns latency. Use the known
nominal latency and form of the insn to speed up the check. */
else if (cost == 3
&& GET_CODE (PATTERN (dep_insn)) == SET
/* Latency for dmpy type insns is also 3, so check that
it's actually a move insn. */
&& general_movsrc_operand (SET_SRC (PATTERN (dep_insn)), SImode))
/* An anti-dependence penalty of two applies if the first insn is a double
precision fadd / fsub / fmul. */
else if (REG_NOTE_KIND (link) == REG_DEP_ANTI
&& recog_memoized (dep_insn) >= 0
&& get_attr_type (dep_insn) == TYPE_DFP_ARITH
/* A lot of alleged anti-flow dependences are fake,
so check this one is real. */
&& flow_dependent_p (dep_insn, insn))
cost = 2;
else if (cost == 30
&& GET_CODE (PATTERN (dep_insn)) == SET
&& GET_MODE (SET_SRC (PATTERN (dep_insn))) == SImode)
cost = 20;
return cost;
}
/* Check if INSN is flow-dependent on DEP_INSN. Can also be used to check
if DEP_INSN is anti-flow dependent on INSN. */
static int
flow_dependent_p (insn, dep_insn)
rtx insn, dep_insn;
{
rtx tmp = PATTERN (insn);
note_stores (PATTERN (dep_insn), flow_dependent_p_1, &tmp);
return tmp == NULL_RTX;
}
/* A helper function for flow_dependent_p called through note_stores. */
static void
flow_dependent_p_1 (x, pat, data)
rtx x;
rtx pat ATTRIBUTE_UNUSED;
void *data;
{
rtx * pinsn = (rtx *) data;
if (*pinsn && reg_referenced_p (x, *pinsn))
*pinsn = NULL_RTX;
}
/* For use by ALLOCATE_INITIAL_VALUE. Note that sh.md contains some
'special function' patterns (type sfunc) that clobber pr, but that
do not look like function calls to leaf_function_p. Hence we must
@@ -7060,27 +7132,26 @@ sh_pr_n_sets ()
return REG_N_SETS (TARGET_SHMEDIA ? PR_MEDIA_REG : PR_REG);
}
/* This Function Returns non zero if DFA based scheduler
interface is to be used.At present supported only for
SH4. */
/* This Function returns non zero if the DFA based scheduler interface
is to be used. At present this is supported for the SH4 only. */
static int
sh_use_dfa_interface()
{
if (TARGET_SH4)
return 1;
else
return 0;
if (TARGET_HARD_SH4)
return 1;
else
return 0;
}
/* This function returns "2" that signifies dual issue
for SH4 processor.To be used by DFA pipeline description. */
/* This function returns "2" to indicate dual issue for the SH4
processor. To be used by the DFA pipeline description. */
static int
sh_issue_rate()
{
if(TARGET_SH4)
return 2;
else
return 1;
if (TARGET_SUPERSCALAR)
return 2;
else
return 1;
}
/* SHmedia requires registers for branches, so we can't generate new

File diff suppressed because it is too large Load Diff