re PR target/40657 (allocate local variables with fewer instructions)

PR target/40657
	* config/arm/arm.c (thumb1_extra_regs_pushed): New arg FOR_PROLOGUE.
	All callers changed.
	Handle the case when we're called for the epilogue.
	(thumb_unexpanded_epilogue): Use it.
	(thumb1_expand_epilogue): Likewise.

testsuite/
	PR target/40657
	* gcc.target/arm/pr40657-1.c: New test.
	* gcc.target/arm/pr40657-2.c: New test.
	* gcc.c-torture/execute/pr40657.c: New test.

From-SVN: r161988
This commit is contained in:
Bernd Schmidt 2010-07-09 09:03:22 +00:00 committed by Bernd Schmidt
parent ac2856486d
commit e784c52cd2
6 changed files with 161 additions and 48 deletions

View File

@ -1,3 +1,12 @@
2010-07-09 Bernd Schmidt <bernds@codesourcery.com>
PR target/40657
* config/arm/arm.c (thumb1_extra_regs_pushed): New arg FOR_PROLOGUE.
All callers changed.
Handle the case when we're called for the epilogue.
(thumb_unexpanded_epilogue): Use it.
(thumb1_expand_epilogue): Likewise.
2010-07-09 Jakub Jelinek <jakub@redhat.com>
* tree-vrp.c (extract_range_from_binary_expr) <BIT_AND_EXPR>: If

View File

@ -19565,6 +19565,81 @@ is_called_in_ARM_mode (tree func)
#endif
}
/* Given the stack offsets and register mask in OFFSETS, decide how
many additional registers to push instead of subtracting a constant
from SP. For epilogues the principle is the same except we use pop.
FOR_PROLOGUE indicates which we're generating.

The return value is the number of extra low registers to add to the
push (or pop) list; 0 means the optimization does not apply. Callers
shrink their explicit SP adjustment by 4 bytes per register returned. */
static int
thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue)
{
HOST_WIDE_INT amount;
unsigned long live_regs_mask = offsets->saved_regs_mask;
/* Extract a mask of the ones we can give to the Thumb's push/pop
instruction. The prologue mask keeps bits 0-7 plus bit 14 (presumably
LR); the epilogue mask keeps only bits 0-7. */
unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff);
/* Then count how many other high registers will need to be pushed. */
unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
int n_free, reg_base;
/* AMOUNT is the stack adjustment we are trying to absorb. For an
epilogue with a frame pointer it is measured from locals_base rather
than from outgoing_args. */
if (!for_prologue && frame_pointer_needed)
amount = offsets->locals_base - offsets->saved_regs;
else
amount = offsets->outgoing_args - offsets->saved_regs;
/* If the stack frame size is 512 exactly, we can save one load
instruction, which should make this a win even when optimizing
for speed. */
if (!optimize_size && amount != 512)
return 0;
/* Can't do this if there are high registers to push. */
if (high_regs_pushed != 0)
return 0;
/* Shouldn't do it in the prologue if no registers would normally
be pushed at all. In the epilogue, also allow it if we'll have
a pop insn for the PC. */
if (l_mask == 0
&& (for_prologue
|| TARGET_BACKTRACE
|| (live_regs_mask & 1 << LR_REGNUM) == 0
|| TARGET_INTERWORK
|| crtl->args.pretend_args_size != 0))
return 0;
/* Don't do this if thumb_expand_prologue wants to emit instructions
between the push and the stack frame allocation. */
if (for_prologue
&& ((flag_pic && arm_pic_register != INVALID_REGNUM)
|| (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)))
return 0;
reg_base = 0;
n_free = 0;
/* In the epilogue, start the search above the registers that
presumably hold the return value, so that popping does not overwrite it.
NOTE(review): the division truncates, so a return size that is not a
multiple of UNITS_PER_WORD would leave its last register unprotected --
confirm arm_size_return_regs never yields such a size. */
if (!for_prologue)
{
reg_base = arm_size_return_regs () / UNITS_PER_WORD;
live_regs_mask >>= reg_base;
}
/* Count consecutive low registers, starting at REG_BASE, that are not
already in the saved-register mask. For the epilogue each one must
also be call-used, because popping into it destroys its contents. */
while (reg_base + n_free < 8 && !(live_regs_mask & 1)
&& (for_prologue || call_used_regs[reg_base + n_free]))
{
live_regs_mask >>= 1;
n_free++;
}
if (n_free == 0)
return 0;
/* The arithmetic below relies on AMOUNT being a multiple of 4. */
gcc_assert (amount / 4 * 4 == amount);
/* AMOUNT is 512 or more, but the free registers can bring the remainder
below 512: use just enough of them to leave exactly 508. */
if (amount >= 512 && (amount - n_free * 4) < 512)
return (amount - 508) / 4;
/* The free registers cover the whole adjustment: use them for all of it. */
if (amount <= n_free * 4)
return amount / 4;
return 0;
}
/* The bits which aren't usefully expanded as rtl. */
const char *
thumb_unexpanded_epilogue (void)
@ -19573,6 +19648,7 @@ thumb_unexpanded_epilogue (void)
int regno;
unsigned long live_regs_mask = 0;
int high_regs_pushed = 0;
int extra_pop;
int had_to_push_lr;
int size;
@ -19592,6 +19668,13 @@ thumb_unexpanded_epilogue (void)
the register is used to hold a return value. */
size = arm_size_return_regs ();
extra_pop = thumb1_extra_regs_pushed (offsets, false);
if (extra_pop > 0)
{
unsigned long extra_mask = (1 << extra_pop) - 1;
live_regs_mask |= extra_mask << (size / UNITS_PER_WORD);
}
/* The prolog may have pushed some high registers to use as
work registers. e.g. the testsuite file:
gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c
@ -19675,7 +19758,9 @@ thumb_unexpanded_epilogue (void)
live_regs_mask);
/* We have either just popped the return address into the
PC or it is was kept in LR for the entire function. */
PC or it is was kept in LR for the entire function.
Note that thumb_pushpop has already called thumb_exit if the
PC was in the list. */
if (!had_to_push_lr)
thumb_exit (asm_out_file, LR_REGNUM);
}
@ -19821,51 +19906,6 @@ thumb_compute_initial_elimination_offset (unsigned int from, unsigned int to)
}
}
/* Given the stack offsets and register mask in OFFSETS, decide
how many additional registers to push instead of subtracting
a constant from SP.

The return value is the number of extra low registers to add to the
push list; 0 means the optimization does not apply. */
static int
thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
{
/* The stack adjustment we are trying to absorb into the push. */
HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
unsigned long live_regs_mask = offsets->saved_regs_mask;
/* Extract a mask of the ones we can give to the Thumb's push instruction. */
unsigned long l_mask = live_regs_mask & 0x40ff;
/* Then count how many other high registers will need to be pushed. */
unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
int n_free;
/* If the stack frame size is 512 exactly, we can save one load
instruction, which should make this a win even when optimizing
for speed. */
if (!optimize_size && amount != 512)
return 0;
/* Can't do this if there are high registers to push, or if we
are not going to do a push at all. */
if (high_regs_pushed != 0 || l_mask == 0)
return 0;
/* Don't do this if thumb1_expand_prologue wants to emit instructions
between the push and the stack frame allocation. */
if ((flag_pic && arm_pic_register != INVALID_REGNUM)
|| (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
return 0;
/* Count consecutive low registers, starting at bit 0, that are not
already in the saved-register mask. */
for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
n_free++;
if (n_free == 0)
return 0;
/* The arithmetic below relies on AMOUNT being a multiple of 4. */
gcc_assert (amount / 4 * 4 == amount);
/* AMOUNT is 512 or more, but the free registers can bring the remainder
below 512: use just enough of them to leave exactly 508. */
if (amount >= 512 && (amount - n_free * 4) < 512)
return (amount - 508) / 4;
/* The free registers cover the whole adjustment: use them for all of it. */
if (amount <= n_free * 4)
return amount / 4;
return 0;
}
/* Generate the rest of a function's prologue. */
void
thumb1_expand_prologue (void)
@ -19902,7 +19942,7 @@ thumb1_expand_prologue (void)
stack_pointer_rtx);
amount = offsets->outgoing_args - offsets->saved_regs;
amount -= 4 * thumb1_extra_regs_pushed (offsets);
amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
if (amount)
{
if (amount < 512)
@ -19987,6 +20027,7 @@ thumb1_expand_epilogue (void)
emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx));
amount = offsets->locals_base - offsets->saved_regs;
}
amount -= 4 * thumb1_extra_regs_pushed (offsets, false);
gcc_assert (amount >= 0);
if (amount)
@ -20209,7 +20250,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
|| (high_regs_pushed == 0 && l_mask))
{
unsigned long mask = l_mask;
mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
thumb_pushpop (f, mask, 1, &cfa_offset, mask);
}

View File

@ -1,3 +1,10 @@
2010-07-09 Bernd Schmidt <bernds@codesourcery.com>
PR target/40657
* gcc.target/arm/pr40657-1.c: New test.
* gcc.target/arm/pr40657-2.c: New test.
* gcc.c-torture/execute/pr40657.c: New test.
2010-07-09 Jakub Jelinek <jakub@redhat.com>
* gcc.dg/tree-ssa/vrp50.c: New test.

View File

@ -0,0 +1,23 @@
/* Verify that the Thumb-1 epilogue size optimization does not clobber the
return value. */
long long v = 0x123456789abc;
/* Non-inlined sink for a pointer. The empty asm names X as a memory
output operand, so the compiler must assume X is written here. */
__attribute__((noinline)) void bar (int *x)
{
asm volatile ("" : "=m" (x) ::);
}
/* Pass the address of a local to bar, then return the 64-bit global V.
NOTE(review): presumably the address-taken local gives foo a small stack
frame, so the epilogue under test has something to pop -- confirm against
the generated code. */
__attribute__((noinline)) long long foo()
{
int x;
bar(&x);
return v;
}
/* Fail the test if the 64-bit value returned by foo differs from V.
   __builtin_abort is used so that no undeclared library function is
   called; implicit function declarations are invalid since C99.  */
int main ()
{
  if (foo () != v)
    __builtin_abort ();
  return 0;
}

View File

@ -0,0 +1,13 @@
/* { dg-options "-Os -march=armv5te -mthumb" } */
/* { dg-require-effective-target arm_thumb1_ok } */
/* { dg-final { scan-assembler "pop.*r1.*pc" } } */
/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */
/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */
extern void bar(int*);
/* One address-taken local and one call. The dg-final directives above
expect a pop listing r1 and pc, and no explicit add/sub on sp. */
int foo()
{
int x;
bar(&x);
return x;
}

View File

@ -0,0 +1,20 @@
/* { dg-options "-Os -march=armv4t -mthumb" } */
/* { dg-require-effective-target arm_thumb1_ok } */
/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */
/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */
/* Here, we test that if there's a pop of r[4567] in the epilogue,
add sp,sp,#12 is removed and replaced by three additional pops
of lower-numbered regs. */
extern void bar(int*);
int t1, t2, t3, t4, t5;
/* Nested loops bounded by the globals t1 and t2 call bar with the
address of x; the final value of x is returned. */
int foo()
{
int i,j,k,x = 0;
for (i = 0; i < t1; i++)
for (j = 0; j < t2; j++)
bar(&x);
return x;
}