From e784c52cd2d0599cc29c8a66240af64cfac79371 Mon Sep 17 00:00:00 2001 From: Bernd Schmidt Date: Fri, 9 Jul 2010 09:03:22 +0000 Subject: [PATCH] re PR target/40657 (allocate local variables with fewer instructions) PR target/40657 * config/arm/arm.c (thumb1_extra_regs_pushed): New arg FOR_PROLOGUE. All callers changed. Handle the case when we're called for the epilogue. (thumb_unexpanded_epilogue): Use it. (thumb1_expand_epilogue): Likewise. testsuite/ PR target/40657 * gcc.target/arm/pr40657-1.c: New test. * gcc.target/arm/pr40657-2.c: New test. * gcc.c-torture/execute/pr40657.c: New test. From-SVN: r161988 --- gcc/ChangeLog | 9 ++ gcc/config/arm/arm.c | 137 ++++++++++++------ gcc/testsuite/ChangeLog | 7 + gcc/testsuite/gcc.c-torture/execute/pr40657.c | 23 +++ gcc/testsuite/gcc.target/arm/pr40657-1.c | 13 ++ gcc/testsuite/gcc.target/arm/pr40657-2.c | 20 +++ 6 files changed, 161 insertions(+), 48 deletions(-) create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr40657.c create mode 100644 gcc/testsuite/gcc.target/arm/pr40657-1.c create mode 100644 gcc/testsuite/gcc.target/arm/pr40657-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e8e4f1b542f..a5d4ceb4978 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2010-07-09 Bernd Schmidt + + PR target/40657 + * config/arm/arm.c (thumb1_extra_regs_pushed): New arg FOR_PROLOGUE. + All callers changed. + Handle the case when we're called for the epilogue. + (thumb_unexpanded_epilogue): Use it. + (thumb1_expand_epilogue): Likewise. + 2010-07-09 Jakub Jelinek * tree-vrp.c (extract_range_from_binary_expr) : If diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 61c83291cb4..db4f701e068 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -19565,6 +19565,81 @@ is_called_in_ARM_mode (tree func) #endif } +/* Given the stack offsets and register mask in OFFSETS, decide how + many additional registers to push instead of subtracting a constant + from SP. For epilogues the principle is the same except we use pop. + FOR_PROLOGUE indicates which we're generating. */ +static int +thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue) +{ + HOST_WIDE_INT amount; + unsigned long live_regs_mask = offsets->saved_regs_mask; + /* Extract a mask of the ones we can give to the Thumb's push/pop + instruction. */ + unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff); + /* Then count how many other high registers will need to be pushed. */ + unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00); + int n_free, reg_base; + + if (!for_prologue && frame_pointer_needed) + amount = offsets->locals_base - offsets->saved_regs; + else + amount = offsets->outgoing_args - offsets->saved_regs; + + /* If the stack frame size is 512 exactly, we can save one load + instruction, which should make this a win even when optimizing + for speed. */ + if (!optimize_size && amount != 512) + return 0; + + /* Can't do this if there are high registers to push. */ + if (high_regs_pushed != 0) + return 0; + + /* Shouldn't do it in the prologue if no registers would normally + be pushed at all. In the epilogue, also allow it if we'll have + a pop insn for the PC. */ + if (l_mask == 0 + && (for_prologue + || TARGET_BACKTRACE + || (live_regs_mask & 1 << LR_REGNUM) == 0 + || TARGET_INTERWORK + || crtl->args.pretend_args_size != 0)) + return 0; + + /* Don't do this if thumb_expand_prologue wants to emit instructions + between the push and the stack frame allocation. */ + if (for_prologue + && ((flag_pic && arm_pic_register != INVALID_REGNUM) + || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))) + return 0; + + reg_base = 0; + n_free = 0; + if (!for_prologue) + { + reg_base = arm_size_return_regs () / UNITS_PER_WORD; + live_regs_mask >>= reg_base; + } + + while (reg_base + n_free < 8 && !(live_regs_mask & 1) + && (for_prologue || call_used_regs[reg_base + n_free])) + { + live_regs_mask >>= 1; + n_free++; + } + + if (n_free == 0) + return 0; + gcc_assert (amount / 4 * 4 == amount); + + if (amount >= 512 && (amount - n_free * 4) < 512) + return (amount - 508) / 4; + if (amount <= n_free * 4) + return amount / 4; + return 0; +} + /* The bits which aren't usefully expanded as rtl. */ const char * thumb_unexpanded_epilogue (void) @@ -19573,6 +19648,7 @@ thumb_unexpanded_epilogue (void) int regno; unsigned long live_regs_mask = 0; int high_regs_pushed = 0; + int extra_pop; int had_to_push_lr; int size; @@ -19592,6 +19668,13 @@ thumb_unexpanded_epilogue (void) the register is used to hold a return value. */ size = arm_size_return_regs (); + extra_pop = thumb1_extra_regs_pushed (offsets, false); + if (extra_pop > 0) + { + unsigned long extra_mask = (1 << extra_pop) - 1; + live_regs_mask |= extra_mask << (size / UNITS_PER_WORD); + } + /* The prolog may have pushed some high registers to use as work registers. e.g. the testsuite file: gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c @@ -19675,7 +19758,9 @@ thumb_unexpanded_epilogue (void) live_regs_mask); /* We have either just popped the return address into the - PC or it is was kept in LR for the entire function. */ + PC or it is was kept in LR for the entire function. + Note that thumb_pushpop has already called thumb_exit if the + PC was in the list. */ if (!had_to_push_lr) thumb_exit (asm_out_file, LR_REGNUM); } @@ -19821,51 +19906,6 @@ thumb_compute_initial_elimination_offset (unsigned int from, unsigned int to) } } -/* Given the stack offsets and register mask in OFFSETS, decide - how many additional registers to push instead of subtracting - a constant from SP. */ -static int -thumb1_extra_regs_pushed (arm_stack_offsets *offsets) -{ - HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs; - unsigned long live_regs_mask = offsets->saved_regs_mask; - /* Extract a mask of the ones we can give to the Thumb's push instruction. */ - unsigned long l_mask = live_regs_mask & 0x40ff; - /* Then count how many other high registers will need to be pushed. */ - unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00); - int n_free; - - /* If the stack frame size is 512 exactly, we can save one load - instruction, which should make this a win even when optimizing - for speed. */ - if (!optimize_size && amount != 512) - return 0; - - /* Can't do this if there are high registers to push, or if we - are not going to do a push at all. */ - if (high_regs_pushed != 0 || l_mask == 0) - return 0; - - /* Don't do this if thumb1_expand_prologue wants to emit instructions - between the push and the stack frame allocation. */ - if ((flag_pic && arm_pic_register != INVALID_REGNUM) - || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)) - return 0; - - for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1) - n_free++; - - if (n_free == 0) - return 0; - gcc_assert (amount / 4 * 4 == amount); - - if (amount >= 512 && (amount - n_free * 4) < 512) - return (amount - 508) / 4; - if (amount <= n_free * 4) - return amount / 4; - return 0; -} - /* Generate the rest of a function's prologue. */ void thumb1_expand_prologue (void) @@ -19902,7 +19942,7 @@ thumb1_expand_prologue (void) stack_pointer_rtx); amount = offsets->outgoing_args - offsets->saved_regs; - amount -= 4 * thumb1_extra_regs_pushed (offsets); + amount -= 4 * thumb1_extra_regs_pushed (offsets, true); if (amount) { if (amount < 512) @@ -19987,6 +20027,7 @@ thumb1_expand_epilogue (void) emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx)); amount = offsets->locals_base - offsets->saved_regs; } + amount -= 4 * thumb1_extra_regs_pushed (offsets, false); gcc_assert (amount >= 0); if (amount) @@ -20209,7 +20250,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) || (high_regs_pushed == 0 && l_mask)) { unsigned long mask = l_mask; - mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1; + mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1; thumb_pushpop (f, mask, 1, &cfa_offset, mask); } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index f6e5b37ca6a..71906e60a9a 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2010-07-09 Bernd Schmidt + + PR target/40657 + * gcc.target/arm/pr40657-1.c: New test. + * gcc.target/arm/pr40657-2.c: New test. + * gcc.c-torture/execute/pr40657.c: New test. + 2010-07-09 Jakub Jelinek * gcc.dg/tree-ssa/vrp50.c: New test. diff --git a/gcc/testsuite/gcc.c-torture/execute/pr40657.c b/gcc/testsuite/gcc.c-torture/execute/pr40657.c new file mode 100644 index 00000000000..e6d8dda9f99 --- /dev/null +++ b/gcc/testsuite/gcc.c-torture/execute/pr40657.c @@ -0,0 +1,23 @@ +/* Verify that that Thumb-1 epilogue size optimization does not clobber the + return value. */ + +long long v = 0x123456789abc; + +__attribute__((noinline)) void bar (int *x) +{ + asm volatile ("" : "=m" (x) ::); +} + +__attribute__((noinline)) long long foo() +{ + int x; + bar(&x); + return v; +} + +int main () +{ + if (foo () != v) + abort (); + exit (0); +} diff --git a/gcc/testsuite/gcc.target/arm/pr40657-1.c b/gcc/testsuite/gcc.target/arm/pr40657-1.c new file mode 100644 index 00000000000..a6ac6c78a1c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/pr40657-1.c @@ -0,0 +1,13 @@ +/* { dg-options "-Os -march=armv5te -mthumb" } */ +/* { dg-require-effective-target arm_thumb1_ok } */ +/* { dg-final { scan-assembler "pop.*r1.*pc" } } */ +/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */ +/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */ + +extern void bar(int*); +int foo() +{ + int x; + bar(&x); + return x; +} diff --git a/gcc/testsuite/gcc.target/arm/pr40657-2.c b/gcc/testsuite/gcc.target/arm/pr40657-2.c new file mode 100644 index 00000000000..31d48376730 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/pr40657-2.c @@ -0,0 +1,20 @@ +/* { dg-options "-Os -march=armv4t -mthumb" } */ +/* { dg-require-effective-target arm_thumb1_ok } */ +/* { dg-final { scan-assembler-not "sub\[\\t \]*sp,\[\\t \]*sp" } } */ +/* { dg-final { scan-assembler-not "add\[\\t \]*sp,\[\\t \]*sp" } } */ + +/* Here, we test that if there's a pop of r[4567] in the epilogue, + add sp,sp,#12 is removed and replaced by three additional pops + of lower-numbered regs. */ + +extern void bar(int*); + +int t1, t2, t3, t4, t5; +int foo() +{ + int i,j,k,x = 0; + for (i = 0; i < t1; i++) + for (j = 0; j < t2; j++) + bar(&x); + return x; +}