timevar.def (TV_SCHED_FUSION): New time var.
* timevar.def (TV_SCHED_FUSION): New time var. * passes.def (pass_sched_fusion): New pass. * config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New. (extract_base_offset_in_addr, fusion_load_store): New. (arm_sched_fusion_priority): New. (arm_option_override): Disable scheduling fusion by default on non-armv7 processors or ldrd/strd isn't preferred. * sched-int.h (struct _haifa_insn_data): New field. (INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New. * sched-rgn.c (rest_of_handle_sched_fusion): New. (pass_data_sched_fusion, pass_sched_fusion): New. (make_pass_sched_fusion): New. * haifa-sched.c (sched_fusion): New. (insn_cost): Handle sched_fusion. (priority): Handle sched_fusion by calling target hook. (enum rfs_decision): New enum value. (rfs_str): New element for RFS_FUSION. (rank_for_schedule): Support sched_fusion. (schedule_insn, max_issue, prune_ready_list): Handle sched_fusion. (schedule_block, fix_tick_ready): Handle sched_fusion. * common.opt (flag_schedule_fusion): New. * tree-pass.h (make_pass_sched_fusion): New. * target.def (fusion_priority): New. * doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New. * doc/tm.texi: Regenerated. * doc/invoke.texi (-fschedule-fusion): New. testsuite: * gcc.target/arm/ldrd-strd-pair-1.c: New test. * gcc.target/arm/vfp-1.c: Improve scanning string. From-SVN: r217533
This commit is contained in:
parent
0fb3402f69
commit
b16abbcb85
|
@ -1,3 +1,32 @@
|
|||
2014-11-14 Bin Cheng <bin.cheng@arm.com>
|
||||
|
||||
* timevar.def (TV_SCHED_FUSION): New time var.
|
||||
* passes.def (pass_sched_fusion): New pass.
|
||||
* config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New.
|
||||
(extract_base_offset_in_addr, fusion_load_store): New.
|
||||
(arm_sched_fusion_priority): New.
|
||||
(arm_option_override): Disable scheduling fusion by default
|
||||
on non-armv7 processors or ldrd/strd isn't preferred.
|
||||
* sched-int.h (struct _haifa_insn_data): New field.
|
||||
(INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New.
|
||||
* sched-rgn.c (rest_of_handle_sched_fusion): New.
|
||||
(pass_data_sched_fusion, pass_sched_fusion): New.
|
||||
(make_pass_sched_fusion): New.
|
||||
* haifa-sched.c (sched_fusion): New.
|
||||
(insn_cost): Handle sched_fusion.
|
||||
(priority): Handle sched_fusion by calling target hook.
|
||||
(enum rfs_decision): New enum value.
|
||||
(rfs_str): New element for RFS_FUSION.
|
||||
(rank_for_schedule): Support sched_fusion.
|
||||
(schedule_insn, max_issue, prune_ready_list): Handle sched_fusion.
|
||||
(schedule_block, fix_tick_ready): Handle sched_fusion.
|
||||
* common.opt (flag_schedule_fusion): New.
|
||||
* tree-pass.h (make_pass_sched_fusion): New.
|
||||
* target.def (fusion_priority): New.
|
||||
* doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New.
|
||||
* doc/tm.texi: Regenerated.
|
||||
* doc/invoke.texi (-fschedule-fusion): New.
|
||||
|
||||
2014-11-13 Rong Xu <xur@google.com>
|
||||
|
||||
PR debug/63581
|
||||
|
|
|
@ -1848,6 +1848,10 @@ frename-registers
|
|||
Common Report Var(flag_rename_registers) Init(2) Optimization
|
||||
Perform a register renaming optimization pass
|
||||
|
||||
fschedule-fusion
|
||||
Common Report Var(flag_schedule_fusion) Init(2) Optimization
|
||||
Perform a target dependent instruction fusion optimization pass
|
||||
|
||||
freorder-blocks
|
||||
Common Report Var(flag_reorder_blocks) Optimization
|
||||
Reorder basic blocks to improve code placement
|
||||
|
|
|
@ -311,6 +311,8 @@ static unsigned arm_add_stmt_cost (void *data, int count,
|
|||
static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
|
||||
bool op0_preserve_value);
|
||||
static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
|
||||
|
||||
static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
|
||||
|
||||
/* Table of machine attributes. */
|
||||
static const struct attribute_spec arm_attribute_table[] =
|
||||
|
@ -708,6 +710,9 @@ static const struct attribute_spec arm_attribute_table[] =
|
|||
#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
|
||||
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
|
||||
|
||||
#undef TARGET_SCHED_FUSION_PRIORITY
|
||||
#define TARGET_SCHED_FUSION_PRIORITY arm_sched_fusion_priority
|
||||
|
||||
struct gcc_target targetm = TARGET_INITIALIZER;
|
||||
|
||||
/* Obstack for minipool constant handling. */
|
||||
|
@ -3168,6 +3173,12 @@ arm_option_override (void)
|
|||
if (TARGET_THUMB2)
|
||||
inline_asm_unified = 1;
|
||||
|
||||
/* Disable scheduling fusion by default if it's not armv7 processor
|
||||
or doesn't prefer ldrd/strd. */
|
||||
if (flag_schedule_fusion == 2
|
||||
&& (!arm_arch7 || !current_tune->prefer_ldrd_strd))
|
||||
flag_schedule_fusion = 0;
|
||||
|
||||
/* Register global variables with the garbage collector. */
|
||||
arm_add_gc_roots ();
|
||||
}
|
||||
|
@ -32350,4 +32361,124 @@ arm_is_constant_pool_ref (rtx x)
|
|||
&& CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)));
|
||||
}
|
||||
|
||||
/* If MEM is in the form of [base+offset], extract the two parts
|
||||
of address and set to BASE and OFFSET, otherwise return false
|
||||
after clearing BASE and OFFSET. */
|
||||
|
||||
static bool
|
||||
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
|
||||
{
|
||||
rtx addr;
|
||||
|
||||
gcc_assert (MEM_P (mem));
|
||||
|
||||
addr = XEXP (mem, 0);
|
||||
|
||||
/* Strip off const from addresses like (const (addr)). */
|
||||
if (GET_CODE (addr) == CONST)
|
||||
addr = XEXP (addr, 0);
|
||||
|
||||
if (GET_CODE (addr) == REG)
|
||||
{
|
||||
*base = addr;
|
||||
*offset = const0_rtx;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (GET_CODE (addr) == PLUS
|
||||
&& GET_CODE (XEXP (addr, 0)) == REG
|
||||
&& CONST_INT_P (XEXP (addr, 1)))
|
||||
{
|
||||
*base = XEXP (addr, 0);
|
||||
*offset = XEXP (addr, 1);
|
||||
return true;
|
||||
}
|
||||
|
||||
*base = NULL_RTX;
|
||||
*offset = NULL_RTX;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* If INSN is a load or store of address in the form of [base+offset],
|
||||
extract the two parts and set to BASE and OFFSET. IS_LOAD is set
|
||||
to TRUE if it's a load. Return TRUE if INSN is such an instruction,
|
||||
otherwise return FALSE. */
|
||||
|
||||
static bool
|
||||
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load)
|
||||
{
|
||||
rtx x, dest, src;
|
||||
|
||||
gcc_assert (INSN_P (insn));
|
||||
x = PATTERN (insn);
|
||||
if (GET_CODE (x) != SET)
|
||||
return false;
|
||||
|
||||
src = SET_SRC (x);
|
||||
dest = SET_DEST (x);
|
||||
if (GET_CODE (src) == REG && GET_CODE (dest) == MEM)
|
||||
{
|
||||
*is_load = false;
|
||||
extract_base_offset_in_addr (dest, base, offset);
|
||||
}
|
||||
else if (GET_CODE (src) == MEM && GET_CODE (dest) == REG)
|
||||
{
|
||||
*is_load = true;
|
||||
extract_base_offset_in_addr (src, base, offset);
|
||||
}
|
||||
else
|
||||
return false;
|
||||
|
||||
return (*base != NULL_RTX && *offset != NULL_RTX);
|
||||
}
|
||||
|
||||
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
|
||||
|
||||
Currently we only support to fuse ldr or str instructions, so FUSION_PRI
|
||||
and PRI are only calculated for these instructions. For other instruction,
|
||||
FUSION_PRI and PRI are simply set to MAX_PRI. In the future, other kind
|
||||
instruction fusion can be supported by returning different priorities.
|
||||
|
||||
It's important that irrelevant instructions get the largest FUSION_PRI. */
|
||||
|
||||
static void
|
||||
arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
|
||||
int *fusion_pri, int *pri)
|
||||
{
|
||||
int tmp, off_val;
|
||||
bool is_load;
|
||||
rtx base, offset;
|
||||
|
||||
gcc_assert (INSN_P (insn));
|
||||
|
||||
tmp = max_pri - 1;
|
||||
if (!fusion_load_store (insn, &base, &offset, &is_load))
|
||||
{
|
||||
*pri = tmp;
|
||||
*fusion_pri = tmp;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Load goes first. */
|
||||
if (is_load)
|
||||
*fusion_pri = tmp - 1;
|
||||
else
|
||||
*fusion_pri = tmp - 2;
|
||||
|
||||
tmp /= 2;
|
||||
|
||||
/* INSN with smaller base register goes first. */
|
||||
tmp -= ((REGNO (base) & 0xff) << 20);
|
||||
|
||||
/* INSN with smaller offset goes first. */
|
||||
off_val = (int)(INTVAL (offset));
|
||||
if (off_val >= 0)
|
||||
tmp -= (off_val & 0xfffff);
|
||||
else
|
||||
tmp += ((- off_val) & 0xfffff);
|
||||
|
||||
*pri = tmp;
|
||||
return;
|
||||
}
|
||||
#include "gt-arm.h"
|
||||
|
|
|
@ -406,7 +406,7 @@ Objective-C and Objective-C++ Dialects}.
|
|||
-fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
|
||||
-fprofile-generate=@var{path} @gol
|
||||
-fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
|
||||
-freciprocal-math -free -frename-registers -freorder-blocks @gol
|
||||
-freciprocal-math -free -frename-registers -fschedule-fusion -freorder-blocks @gol
|
||||
-freorder-blocks-and-partition -freorder-functions @gol
|
||||
-frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
|
||||
-frounding-math -fsched2-use-superblocks -fsched-pressure @gol
|
||||
|
@ -9575,6 +9575,14 @@ a ``home register''.
|
|||
|
||||
Enabled by default with @option{-funroll-loops} and @option{-fpeel-loops}.
|
||||
|
||||
@item -fschedule-fusion
|
||||
@opindex fschedule-fusion
|
||||
Performs a target dependent pass over the instruction stream to schedule
|
||||
instructions of the same type together because the target machine can execute them
|
||||
more efficiently if they are adjacent to each other in the instruction flow.
|
||||
|
||||
Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}.
|
||||
|
||||
@item -ftracer
|
||||
@opindex ftracer
|
||||
Perform tail duplication to enlarge superblock size. This transformation
|
||||
|
|
|
@ -6771,6 +6771,76 @@ This hook is called by tree reassociator to determine a level of
|
|||
parallelism required in output calculations chain.
|
||||
@end deftypefn
|
||||
|
||||
@deftypefn {Target Hook} void TARGET_SCHED_FUSION_PRIORITY (rtx_insn *@var{insn}, int @var{max_pri}, int *@var{fusion_pri}, int *@var{pri})
|
||||
This hook is called by scheduling fusion pass. It calculates fusion
|
||||
priorities for each instruction passed in by parameter. The priorities
|
||||
are returned via pointer parameters.
|
||||
|
||||
@var{insn} is the instruction whose priorities need to be calculated.
|
||||
@var{max_pri} is the maximum priority that can be returned in any case.
|
||||
@var{fusion_pri} is the pointer parameter through which @var{insn}'s
|
||||
fusion priority should be calculated and returned.
|
||||
@var{pri} is the pointer parameter through which @var{insn}'s priority
|
||||
should be calculated and returned.
|
||||
|
||||
Same @var{fusion_pri} should be returned for instructions which should
|
||||
be scheduled together. Different @var{pri} should be returned for
|
||||
instructions with same @var{fusion_pri}. @var{fusion_pri} is the major
|
||||
sort key, @var{pri} is the minor sort key. All instructions will be
|
||||
scheduled according to the two priorities. All priorities calculated
|
||||
should be between 0 (exclusive) and @var{max_pri} (inclusive). To avoid
|
||||
false dependencies, @var{fusion_pri} of instructions which need to be
|
||||
scheduled together should be smaller than @var{fusion_pri} of irrelevant
|
||||
instructions.
|
||||
|
||||
Given below example:
|
||||
|
||||
ldr r10, [r1, 4]
|
||||
add r4, r4, r10
|
||||
ldr r15, [r2, 8]
|
||||
sub r5, r5, r15
|
||||
ldr r11, [r1, 0]
|
||||
add r4, r4, r11
|
||||
ldr r16, [r2, 12]
|
||||
sub r5, r5, r16
|
||||
|
||||
On targets like ARM/AArch64, the two pairs of consecutive loads should be
|
||||
merged. The peephole2 pass can't help in this case unless consecutive
|
||||
loads are actually next to each other in instruction flow. That's where
|
||||
this scheduling fusion pass works. This hook calculates priority for each
|
||||
instruction based on its fusion type, like:
|
||||
|
||||
ldr r10, [r1, 4] ; fusion_pri=99, pri=96
|
||||
add r4, r4, r10 ; fusion_pri=100, pri=100
|
||||
ldr r15, [r2, 8] ; fusion_pri=98, pri=92
|
||||
sub r5, r5, r15 ; fusion_pri=100, pri=100
|
||||
ldr r11, [r1, 0] ; fusion_pri=99, pri=100
|
||||
add r4, r4, r11 ; fusion_pri=100, pri=100
|
||||
ldr r16, [r2, 12] ; fusion_pri=98, pri=88
|
||||
sub r5, r5, r16 ; fusion_pri=100, pri=100
|
||||
|
||||
Scheduling fusion pass then sorts all ready to issue instructions according
|
||||
to the priorities. As a result, instructions of same fusion type will be
|
||||
pushed together in instruction flow, like:
|
||||
|
||||
ldr r11, [r1, 0]
|
||||
ldr r10, [r1, 4]
|
||||
ldr r15, [r2, 8]
|
||||
ldr r16, [r2, 12]
|
||||
add r4, r4, r10
|
||||
sub r5, r5, r15
|
||||
add r4, r4, r11
|
||||
sub r5, r5, r16
|
||||
|
||||
Now peephole2 pass can simply merge the two pairs of loads.
|
||||
|
||||
Since scheduling fusion pass relies on peephole2 to do real fusion
|
||||
work, it is only enabled by default when peephole2 is in effect.
|
||||
|
||||
This was first introduced on ARM/AArch64 targets; please refer to
|
||||
the hook implementation for how different fusion types are supported.
|
||||
@end deftypefn
|
||||
|
||||
@node Sections
|
||||
@section Dividing the Output into Sections (Texts, Data, @dots{})
|
||||
@c the above section title is WAY too long. maybe cut the part between
|
||||
|
|
|
@ -4811,6 +4811,8 @@ them: try the first ones in this list first.
|
|||
|
||||
@hook TARGET_SCHED_REASSOCIATION_WIDTH
|
||||
|
||||
@hook TARGET_SCHED_FUSION_PRIORITY
|
||||
|
||||
@node Sections
|
||||
@section Dividing the Output into Sections (Texts, Data, @dots{})
|
||||
@c the above section title is WAY too long. maybe cut the part between
|
||||
|
|
|
@ -1391,6 +1391,9 @@ insn_cost (rtx_insn *insn)
|
|||
{
|
||||
int cost;
|
||||
|
||||
if (sched_fusion)
|
||||
return 0;
|
||||
|
||||
if (sel_sched_p ())
|
||||
{
|
||||
if (recog_memoized (insn) < 0)
|
||||
|
@ -1603,6 +1606,8 @@ dep_list_size (rtx insn, sd_list_types_def list)
|
|||
return nodbgcount;
|
||||
}
|
||||
|
||||
bool sched_fusion;
|
||||
|
||||
/* Compute the priority number for INSN. */
|
||||
static int
|
||||
priority (rtx_insn *insn)
|
||||
|
@ -1617,7 +1622,15 @@ priority (rtx_insn *insn)
|
|||
{
|
||||
int this_priority = -1;
|
||||
|
||||
if (dep_list_size (insn, SD_LIST_FORW) == 0)
|
||||
if (sched_fusion)
|
||||
{
|
||||
int this_fusion_priority;
|
||||
|
||||
targetm.sched.fusion_priority (insn, FUSION_MAX_PRIORITY,
|
||||
&this_fusion_priority, &this_priority);
|
||||
INSN_FUSION_PRIORITY (insn) = this_fusion_priority;
|
||||
}
|
||||
else if (dep_list_size (insn, SD_LIST_FORW) == 0)
|
||||
/* ??? We should set INSN_PRIORITY to insn_cost when and insn has
|
||||
some forward deps but all of them are ignored by
|
||||
contributes_to_priority hook. At the moment we set priority of
|
||||
|
@ -2548,7 +2561,7 @@ enum rfs_decision {
|
|||
RFS_SCHED_GROUP, RFS_PRESSURE_DELAY, RFS_PRESSURE_TICK,
|
||||
RFS_FEEDS_BACKTRACK_INSN, RFS_PRIORITY, RFS_SPECULATION,
|
||||
RFS_SCHED_RANK, RFS_LAST_INSN, RFS_PRESSURE_INDEX,
|
||||
RFS_DEP_COUNT, RFS_TIE, RFS_N };
|
||||
RFS_DEP_COUNT, RFS_TIE, RFS_FUSION, RFS_N };
|
||||
|
||||
/* Corresponding strings for print outs. */
|
||||
static const char *rfs_str[RFS_N] = {
|
||||
|
@ -2556,7 +2569,7 @@ static const char *rfs_str[RFS_N] = {
|
|||
"RFS_SCHED_GROUP", "RFS_PRESSURE_DELAY", "RFS_PRESSURE_TICK",
|
||||
"RFS_FEEDS_BACKTRACK_INSN", "RFS_PRIORITY", "RFS_SPECULATION",
|
||||
"RFS_SCHED_RANK", "RFS_LAST_INSN", "RFS_PRESSURE_INDEX",
|
||||
"RFS_DEP_COUNT", "RFS_TIE" };
|
||||
"RFS_DEP_COUNT", "RFS_TIE", "RFS_FUSION" };
|
||||
|
||||
/* Statistical breakdown of rank_for_schedule decisions. */
|
||||
typedef struct { unsigned stats[RFS_N]; } rank_for_schedule_stats_t;
|
||||
|
@ -2627,6 +2640,55 @@ rank_for_schedule (const void *x, const void *y)
|
|||
/* Make sure that priority of TMP and TMP2 are initialized. */
|
||||
gcc_assert (INSN_PRIORITY_KNOWN (tmp) && INSN_PRIORITY_KNOWN (tmp2));
|
||||
|
||||
if (sched_fusion)
|
||||
{
|
||||
/* The instruction that has the same fusion priority as the last
|
||||
instruction is the instruction we picked next. If that is not
|
||||
the case, we sort ready list firstly by fusion priority, then
|
||||
by priority, and at last by INSN_LUID. */
|
||||
int a = INSN_FUSION_PRIORITY (tmp);
|
||||
int b = INSN_FUSION_PRIORITY (tmp2);
|
||||
int last = -1;
|
||||
|
||||
if (last_nondebug_scheduled_insn
|
||||
&& !NOTE_P (last_nondebug_scheduled_insn)
|
||||
&& BLOCK_FOR_INSN (tmp)
|
||||
== BLOCK_FOR_INSN (last_nondebug_scheduled_insn))
|
||||
last = INSN_FUSION_PRIORITY (last_nondebug_scheduled_insn);
|
||||
|
||||
if (a != last && b != last)
|
||||
{
|
||||
if (a == b)
|
||||
{
|
||||
a = INSN_PRIORITY (tmp);
|
||||
b = INSN_PRIORITY (tmp2);
|
||||
}
|
||||
if (a != b)
|
||||
return rfs_result (RFS_FUSION, b - a, tmp, tmp2);
|
||||
else
|
||||
return rfs_result (RFS_FUSION,
|
||||
INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2);
|
||||
}
|
||||
else if (a == b)
|
||||
{
|
||||
gcc_assert (last_nondebug_scheduled_insn
|
||||
&& !NOTE_P (last_nondebug_scheduled_insn));
|
||||
last = INSN_PRIORITY (last_nondebug_scheduled_insn);
|
||||
|
||||
a = abs (INSN_PRIORITY (tmp) - last);
|
||||
b = abs (INSN_PRIORITY (tmp2) - last);
|
||||
if (a != b)
|
||||
return rfs_result (RFS_FUSION, a - b, tmp, tmp2);
|
||||
else
|
||||
return rfs_result (RFS_FUSION,
|
||||
INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2);
|
||||
}
|
||||
else if (a == last)
|
||||
return rfs_result (RFS_FUSION, -1, tmp, tmp2);
|
||||
else
|
||||
return rfs_result (RFS_FUSION, 1, tmp, tmp2);
|
||||
}
|
||||
|
||||
if (sched_pressure != SCHED_PRESSURE_NONE)
|
||||
{
|
||||
/* Prefer insn whose scheduling results in the smallest register
|
||||
|
@ -4007,8 +4069,8 @@ schedule_insn (rtx_insn *insn)
|
|||
gcc_assert (INSN_TICK (insn) >= MIN_TICK);
|
||||
if (INSN_TICK (insn) > clock_var)
|
||||
/* INSN has been prematurely moved from the queue to the ready list.
|
||||
This is possible only if following flag is set. */
|
||||
gcc_assert (flag_sched_stalled_insns);
|
||||
This is possible only if one of the following flags is set. */
|
||||
gcc_assert (flag_sched_stalled_insns || sched_fusion);
|
||||
|
||||
/* ??? Probably, if INSN is scheduled prematurely, we should leave
|
||||
INSN_TICK untouched. This is a machine-dependent issue, actually. */
|
||||
|
@ -5500,6 +5562,9 @@ max_issue (struct ready_list *ready, int privileged_n, state_t state,
|
|||
struct choice_entry *top;
|
||||
rtx_insn *insn;
|
||||
|
||||
if (sched_fusion)
|
||||
return 0;
|
||||
|
||||
n_ready = ready->n_ready;
|
||||
gcc_assert (dfa_lookahead >= 1 && privileged_n >= 0
|
||||
&& privileged_n <= n_ready);
|
||||
|
@ -5848,6 +5913,9 @@ prune_ready_list (state_t temp_state, bool first_cycle_insn_p,
|
|||
bool sched_group_found = false;
|
||||
int min_cost_group = 1;
|
||||
|
||||
if (sched_fusion)
|
||||
return;
|
||||
|
||||
for (i = 0; i < ready.n_ready; i++)
|
||||
{
|
||||
rtx_insn *insn = ready_element (&ready, i);
|
||||
|
@ -6059,7 +6127,7 @@ schedule_block (basic_block *target_bb, state_t init_state)
|
|||
rtx_insn *tail = PREV_INSN (next_tail);
|
||||
|
||||
if ((current_sched_info->flags & DONT_BREAK_DEPENDENCIES) == 0
|
||||
&& sched_pressure != SCHED_PRESSURE_MODEL)
|
||||
&& sched_pressure != SCHED_PRESSURE_MODEL && !sched_fusion)
|
||||
find_modifiable_mems (head, tail);
|
||||
|
||||
/* We used to have code to avoid getting parameters moved from hard
|
||||
|
@ -6455,7 +6523,7 @@ schedule_block (basic_block *target_bb, state_t init_state)
|
|||
{
|
||||
memcpy (temp_state, curr_state, dfa_state_size);
|
||||
cost = state_transition (curr_state, insn);
|
||||
if (sched_pressure != SCHED_PRESSURE_WEIGHTED)
|
||||
if (sched_pressure != SCHED_PRESSURE_WEIGHTED && !sched_fusion)
|
||||
gcc_assert (cost < 0);
|
||||
if (memcmp (temp_state, curr_state, dfa_state_size) != 0)
|
||||
cycle_issued_insns++;
|
||||
|
@ -7288,7 +7356,7 @@ fix_tick_ready (rtx_insn *next)
|
|||
INSN_TICK (next) = tick;
|
||||
|
||||
delay = tick - clock_var;
|
||||
if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE)
|
||||
if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE || sched_fusion)
|
||||
delay = QUEUE_READY;
|
||||
|
||||
change_queue_index (next, delay);
|
||||
|
|
|
@ -419,6 +419,7 @@ along with GCC; see the file COPYING3. If not see
|
|||
NEXT_PASS (pass_stack_adjustments);
|
||||
NEXT_PASS (pass_jump2);
|
||||
NEXT_PASS (pass_duplicate_computed_gotos);
|
||||
NEXT_PASS (pass_sched_fusion);
|
||||
NEXT_PASS (pass_peephole2);
|
||||
NEXT_PASS (pass_if_after_reload);
|
||||
NEXT_PASS (pass_regrename);
|
||||
|
|
|
@ -805,6 +805,9 @@ struct _haifa_insn_data
|
|||
/* A priority for each insn. */
|
||||
int priority;
|
||||
|
||||
/* The fusion priority for each insn. */
|
||||
int fusion_priority;
|
||||
|
||||
/* The minimum clock tick at which the insn becomes ready. This is
|
||||
used to note timing constraints for the insns in the pending list. */
|
||||
int tick;
|
||||
|
@ -903,6 +906,7 @@ extern vec<haifa_insn_data_def> h_i_d;
|
|||
/* Accessor macros for h_i_d. There are more in haifa-sched.c and
|
||||
sched-rgn.c. */
|
||||
#define INSN_PRIORITY(INSN) (HID (INSN)->priority)
|
||||
#define INSN_FUSION_PRIORITY(INSN) (HID (INSN)->fusion_priority)
|
||||
#define INSN_REG_PRESSURE(INSN) (HID (INSN)->reg_pressure)
|
||||
#define INSN_MAX_REG_PRESSURE(INSN) (HID (INSN)->max_reg_pressure)
|
||||
#define INSN_REG_USE_LIST(INSN) (HID (INSN)->reg_use_list)
|
||||
|
@ -1620,6 +1624,10 @@ extern void sd_copy_back_deps (rtx_insn *, rtx_insn *, bool);
|
|||
extern void sd_delete_dep (sd_iterator_def);
|
||||
extern void sd_debug_lists (rtx, sd_list_types_def);
|
||||
|
||||
/* Macros and declarations for scheduling fusion. */
|
||||
#define FUSION_MAX_PRIORITY (INT_MAX)
|
||||
extern bool sched_fusion;
|
||||
|
||||
#endif /* INSN_SCHEDULING */
|
||||
|
||||
#endif /* GCC_SCHED_INT_H */
|
||||
|
|
|
@ -3658,6 +3658,17 @@ rest_of_handle_sched2 (void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Body of the scheduling-fusion pass.  When INSN_SCHEDULING is defined,
   run schedule_insns with the global SCHED_FUSION flag set to true and
   reset the flag to false afterwards; otherwise do nothing.
   Always returns 0.  */

static unsigned int
rest_of_handle_sched_fusion (void)
{
#ifdef INSN_SCHEDULING
  /* The flag is raised only for the duration of this scheduler run.  */
  sched_fusion = true;
  schedule_insns ();
  sched_fusion = false;
#endif

  return 0;
}
|
||||
|
||||
namespace {
|
||||
|
||||
const pass_data pass_data_live_range_shrinkage =
|
||||
|
@ -3800,3 +3811,55 @@ make_pass_sched2 (gcc::context *ctxt)
|
|||
{
|
||||
return new pass_sched2 (ctxt);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
const pass_data pass_data_sched_fusion =
|
||||
{
|
||||
RTL_PASS, /* type */
|
||||
"sched_fusion", /* name */
|
||||
OPTGROUP_NONE, /* optinfo_flags */
|
||||
TV_SCHED_FUSION, /* tv_id */
|
||||
0, /* properties_required */
|
||||
0, /* properties_provided */
|
||||
0, /* properties_destroyed */
|
||||
0, /* todo_flags_start */
|
||||
TODO_df_finish, /* todo_flags_finish */
|
||||
};
|
||||
|
||||
class pass_sched_fusion : public rtl_opt_pass
|
||||
{
|
||||
public:
|
||||
pass_sched_fusion (gcc::context *ctxt)
|
||||
: rtl_opt_pass (pass_data_sched_fusion, ctxt)
|
||||
{}
|
||||
|
||||
/* opt_pass methods: */
|
||||
virtual bool gate (function *);
|
||||
virtual unsigned int execute (function *)
|
||||
{
|
||||
return rest_of_handle_sched_fusion ();
|
||||
}
|
||||
|
||||
}; // class pass_sched2
|
||||
|
||||
bool
|
||||
pass_sched_fusion::gate (function *)
|
||||
{
|
||||
#ifdef INSN_SCHEDULING
|
||||
/* Scheduling fusion relies on peephole2 to do real fusion work,
|
||||
so only enable it if peephole2 is in effect. */
|
||||
return (optimize > 0 && flag_peephole2
|
||||
&& flag_schedule_fusion && targetm.sched.fusion_priority != NULL);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // anon namespace
|
||||
|
||||
/* Allocate a new scheduling-fusion pass object for CTXT and return it
   through its rtl_opt_pass base type.  */

rtl_opt_pass *
make_pass_sched_fusion (gcc::context *ctxt)
{
  rtl_opt_pass *pass = new pass_sched_fusion (ctxt);
  return pass;
}
|
||||
|
|
|
@ -1526,6 +1526,79 @@ parallelism required in output calculations chain.",
|
|||
int, (unsigned int opc, machine_mode mode),
|
||||
hook_int_uint_mode_1)
|
||||
|
||||
/* The following member value is a function that returns priority for
|
||||
fusion of each instruction via pointer parameters. */
|
||||
DEFHOOK
|
||||
(fusion_priority,
|
||||
"This hook is called by scheduling fusion pass. It calculates fusion\n\
|
||||
priorities for each instruction passed in by parameter. The priorities\n\
|
||||
are returned via pointer parameters.\n\
|
||||
\n\
|
||||
@var{insn} is the instruction whose priorities need to be calculated.\n\
|
||||
@var{max_pri} is the maximum priority that can be returned in any case.\n\
|
||||
@var{fusion_pri} is the pointer parameter through which @var{insn}'s\n\
|
||||
fusion priority should be calculated and returned.\n\
|
||||
@var{pri} is the pointer parameter through which @var{insn}'s priority\n\
|
||||
should be calculated and returned.\n\
|
||||
\n\
|
||||
Same @var{fusion_pri} should be returned for instructions which should\n\
|
||||
be scheduled together. Different @var{pri} should be returned for\n\
|
||||
instructions with same @var{fusion_pri}. @var{fusion_pri} is the major\n\
|
||||
sort key, @var{pri} is the minor sort key. All instructions will be\n\
|
||||
scheduled according to the two priorities. All priorities calculated\n\
|
||||
should be between 0 (exclusive) and @var{max_pri} (inclusive). To avoid\n\
|
||||
false dependencies, @var{fusion_pri} of instructions which need to be\n\
|
||||
scheduled together should be smaller than @var{fusion_pri} of irrelevant\n\
|
||||
instructions.\n\
|
||||
\n\
|
||||
Given below example:\n\
|
||||
\n\
|
||||
ldr r10, [r1, 4]\n\
|
||||
add r4, r4, r10\n\
|
||||
ldr r15, [r2, 8]\n\
|
||||
sub r5, r5, r15\n\
|
||||
ldr r11, [r1, 0]\n\
|
||||
add r4, r4, r11\n\
|
||||
ldr r16, [r2, 12]\n\
|
||||
sub r5, r5, r16\n\
|
||||
\n\
|
||||
On targets like ARM/AArch64, the two pairs of consecutive loads should be\n\
|
||||
merged. The peephole2 pass can't help in this case unless consecutive\n\
|
||||
loads are actually next to each other in instruction flow. That's where\n\
|
||||
this scheduling fusion pass works. This hook calculates priority for each\n\
|
||||
instruction based on its fusion type, like:\n\
|
||||
\n\
|
||||
ldr r10, [r1, 4] ; fusion_pri=99, pri=96 \n\
|
||||
add r4, r4, r10 ; fusion_pri=100, pri=100 \n\
|
||||
ldr r15, [r2, 8] ; fusion_pri=98, pri=92 \n\
|
||||
sub r5, r5, r15 ; fusion_pri=100, pri=100 \n\
|
||||
ldr r11, [r1, 0] ; fusion_pri=99, pri=100 \n\
|
||||
add r4, r4, r11 ; fusion_pri=100, pri=100 \n\
|
||||
ldr r16, [r2, 12] ; fusion_pri=98, pri=88 \n\
|
||||
sub r5, r5, r16 ; fusion_pri=100, pri=100 \n\
|
||||
\n\
|
||||
Scheduling fusion pass then sorts all ready to issue instructions according\n\
|
||||
to the priorities. As a result, instructions of same fusion type will be\n\
|
||||
pushed together in instruction flow, like:\n\
|
||||
\n\
|
||||
ldr r11, [r1, 0]\n\
|
||||
ldr r10, [r1, 4]\n\
|
||||
ldr r15, [r2, 8]\n\
|
||||
ldr r16, [r2, 12]\n\
|
||||
add r4, r4, r10\n\
|
||||
sub r5, r5, r15\n\
|
||||
add r4, r4, r11\n\
|
||||
sub r5, r5, r16\n\
|
||||
\n\
|
||||
Now peephole2 pass can simply merge the two pairs of loads.\n\
|
||||
\n\
|
||||
Since scheduling fusion pass relies on peephole2 to do real fusion\n\
|
||||
work, it is only enabled by default when peephole2 is in effect.\n\
|
||||
\n\
|
||||
This was first introduced on ARM/AArch64 targets; please refer to\n\
|
||||
the hook implementation for how different fusion types are supported.",
|
||||
void, (rtx_insn *insn, int max_pri, int *fusion_pri, int *pri), NULL)
|
||||
|
||||
HOOK_VECTOR_END (sched)
|
||||
|
||||
/* Functions relating to OpenMP and Cilk Plus SIMD clones. */
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
2014-11-14 Bin Cheng <bin.cheng@arm.com>
|
||||
|
||||
* gcc.target/arm/ldrd-strd-pair-1.c: New test.
|
||||
* gcc.target/arm/vfp-1.c: Improve scanning string.
|
||||
|
||||
2014-11-13 Rong Xu <xur@google.com>
|
||||
|
||||
PR debug/63581
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target arm_prefer_ldrd_strd } */
|
||||
/* { dg-options "-O2 -mthumb" } */
|
||||
/* Compile-only test: the dg-final directive at the end scans the generated
   assembly for an strd instruction when the target is arm_thumb2_ok.  */

||||
/* Global object written by foo.  x and y are declared next to each other;
   d follows the char member c.  */
struct
|
||||
{
|
||||
int x;
|
||||
int y;
|
||||
char c;
|
||||
int d;
|
||||
}a;
|
||||

||||
/* Stores X to a.x, copies a.x into a.d, then stores Y to a.y, so the
   stores to a.x and a.y are separated in program order by the store to
   a.d.  Always returns 0.
   NOTE(review): presumably the strd expected below is the a.x/a.y pair
   once the stores are brought together -- confirm against the generated
   code.  */
int foo(int x, int y)
|
||||
{
|
||||
int c;
|
||||
a.x = x;
|
||||
c = a.x;
|
||||
a.d = c;
|
||||
a.y = y;
|
||||

||||
return 0;
|
||||
}
|
||||
/* { dg-final { scan-assembler "strd\t" { target { arm_thumb2_ok } } } } */
|
|
@ -126,7 +126,7 @@ void test_convert () {
|
|||
}
|
||||
|
||||
void test_ldst (float f[], double d[]) {
|
||||
/* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #1020\\\]" } } */
|
||||
/* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #-?\[0-9\]+\\\]" } } */
|
||||
/* { dg-final { scan-assembler "vldr.32.+ \\\[r\[0-9\], #-1020\\\]" { target { arm32 && { ! arm_thumb2_ok } } } } } */
|
||||
/* { dg-final { scan-assembler "add.+ r0, #1024" } } */
|
||||
/* { dg-final { scan-assembler "vstr.32.+ \\\[r\[0-9\]\\\]\n" } } */
|
||||
|
|
|
@ -247,6 +247,7 @@ DEFTIMEVAR (TV_IFCVT2 , "if-conversion 2")
|
|||
DEFTIMEVAR (TV_COMBINE_STACK_ADJUST , "combine stack adjustments")
|
||||
DEFTIMEVAR (TV_PEEPHOLE2 , "peephole 2")
|
||||
DEFTIMEVAR (TV_RENAME_REGISTERS , "rename registers")
|
||||
DEFTIMEVAR (TV_SCHED_FUSION , "scheduling fusion")
|
||||
DEFTIMEVAR (TV_CPROP_REGISTERS , "hard reg cprop")
|
||||
DEFTIMEVAR (TV_SCHED2 , "scheduling 2")
|
||||
DEFTIMEVAR (TV_MACH_DEP , "machine dep reorg")
|
||||
|
|
|
@ -552,6 +552,7 @@ extern rtl_opt_pass *make_pass_branch_target_load_optimize1 (gcc::context
|
|||
extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
|
||||
*ctxt);
|
||||
extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_if_after_reload (gcc::context *ctxt);
|
||||
extern rtl_opt_pass *make_pass_regrename (gcc::context *ctxt);
|
||||
|
|
Loading…
Reference in New Issue