diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5cb82911b82..9079462731f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2006-03-23 J"orn Rennecke + + * config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files. + * config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant. + * config/sh/t-sh (LIB1ASMFUNCS): Add _div_table. + * config/sh/sh.opt (mdiv=): Amend description. + * config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro. + (TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise. + (sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1, + SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC. + (OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1. + Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros. + * config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns. + (udivsi3, divsi3): Use them. Check TARGET_DIVIDE_CALL_TABLE / + TARGET_DIVIDE_CALL_FP. + 2006-03-23 Maxim Kuvyrkov * haifa-sched.c (choose_ready): Fix type of the local variable. 
diff --git a/gcc/config/sh/divcost-analysis b/gcc/config/sh/divcost-analysis new file mode 100644 index 00000000000..541e31324b3 --- /dev/null +++ b/gcc/config/sh/divcost-analysis @@ -0,0 +1,76 @@ +Analysis of cycle costs for SH4: + +-> udiv_le128: 5 +-> udiv_ge64k: 6 +-> udiv udiv_25: 10 +-> pos_divisor: 3 +-> pos_result linear: 5 +-> pos_result - -: 5 +-> div_le128: 7 +-> div_ge64k: 9 +sdivsi3 -> udiv_25 13 +udiv_25 -> div_ge64k_end: 15 +div_ge64k_end -> rts: 13 +div_le128 -> div_le128_2: 2, r1 latency 3 +udiv_le128 -> div_le128_2: 2, r1 latency 3 +(u)div_le128 -> div_by_1: 9 +(u)div_le128 -> rts: 17 +div_by_1(_neg) -> rts: 4 +div_ge64k -> div_r8: 2 +div_ge64k -> div_ge64k_2: 3 +udiv_ge64k -> udiv_r8: 3 +udiv_ge64k -> div_ge64k_2: 3 + LS +(u)div_ge64k -> div_ge64k_end: 13 +div_r8 -> div_r8_2: 2 +udiv_r8 -> div_r8_2: 2 + LS +(u)div_r8 -> rts: 21 + +-> - + neg_result: 5 +-> + - neg_result: 5 +-> div_le128_neg: 7 +-> div_ge64k_neg: 9 +-> div_r8_neg: 11 +-> <64k div_ge64k_neg_end: 28 +-> >=64k div_ge64k_neg_end: 22 +div_ge64k_neg_end ft -> rts: 14 +div_r8_neg_end -> rts: 4 +div_r8_neg -> div_r8_neg_end: 18 +div_le128_neg -> div_by_1_neg: 4 +div_le128_neg -> rts 18 + + absolute divisor range: + 1 [2..128] [129..64K) [64K..|dividend|/256] >=64K,>|dividend|/256 +udiv 18 22 38 32 30 +sdiv pos: 20 24 41 35 32 +sdiv neg: 15 25 42 36 33 + + +fp-based: + +unsigned: 42 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site +signed: 33 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site + +call-div1: divisor range: + [1..64K) >= 64K +unsigned: 63 58 +signed: 76 76 + +SFUNC_STATIC call overhead: +mov.l 0f,r1 +bsrf r1 + +SFUNC_GOT call overhead - current: +mov.l 0f,r1 +mova 0f,r0 +mov.l 1f,r2 +add r1,r0 +mov.l @(r0,r2),r0 +jmp @r0 +; 3 cycles worse than SFUNC_STATIC + +SFUNC_GOT call overhead - improved assembler: +mov.l 0f,r1 +mova 0f,r0 +mov.l @(r0,r1),r0 +jmp @r0 +; 2 cycles worse than SFUNC_STATIC diff --git a/gcc/config/sh/divtab-sh4.c 
b/gcc/config/sh/divtab-sh4.c new file mode 100644 index 00000000000..e7de6c49a74 --- /dev/null +++ b/gcc/config/sh/divtab-sh4.c @@ -0,0 +1,90 @@ +/* Copyright (C) 2004 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2, or (at your option) any +later version. + +In addition to the permissions in the GNU General Public License, the +Free Software Foundation gives you unlimited permission to link the +compiled version of this file into combinations with other programs, +and to distribute those combinations without any restriction coming +from the use of this file. (The General Public License restrictions +do apply in other respects; for example, they cover modification of +the file, and distribution when not linked into a combine +executable.) + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, write to +the Free Software Foundation, 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +/* Calculate division table for SH2..4 integer division + Contributed by Joern Rennecke + joern.rennecke@superh.com */ + +#include <stdio.h> +#include <math.h> + +int +main () +{ + int i, j; + double q, r, err, max_err = 0, max_s_err = 0; + + puts("/* This table has been generated by divtab-sh4.c. */"); + puts ("\t.balign 4"); + puts ("LOCAL(div_table_clz):"); + /* output some dummy number for 1/0. */ + printf ("\t.byte\t%d\n", 0); + for (i = 1; i <= 128; i++) + { + int n = 0; + if (i == 128) + puts ("\ +/* Lookup table translating positive divisor to index into table of\n\ + normalized inverse. N.B. 
the '0' entry is also the last entry of the\n\ + previous table, and causes an unaligned access for division by zero. */\n\ +LOCAL(div_table_ix):"); + for (j = i; j <= 128; j += j) + n++; + printf ("\t.byte\t%d\n", n - 7); + } + for (i = 1; i <= 128; i++) + { + j = i < 0 ? -i : i; + while (j < 128) + j += j; + printf ("\t.byte\t%d\n", j * 2 - 96*4); + } + puts("\ +/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. */\n\ + .balign 4\n\ +LOCAL(zero_l):"); + for (i = 64; i < 128; i++) + { + if (i == 96) + puts ("LOCAL(div_table):"); + q = 4.*(1<<30)*128/i; + r = ceil (q); + /* The value for 64 is actually differently scaled than it would + appear from this calculation. The implicit part is %01, not 10. + Still, since the value in the table is 0 either way, this + doesn't matter here. Still, the 1/64 entry is effectively a 1/128 + entry. */ + printf ("\t.long\t0x%X\n", (unsigned) r); + err = r - q; + if (err > max_err) + max_err = err; + err = err * i / 128; + if (err > max_s_err) + max_s_err = err; + } + printf ("\t/* maximum error: %f scaled: %f*/\n", max_err, max_s_err); + exit (0); +} diff --git a/gcc/config/sh/lib1funcs.asm b/gcc/config/sh/lib1funcs.asm index 53334500865..7dfe73ed783 100644 --- a/gcc/config/sh/lib1funcs.asm +++ b/gcc/config/sh/lib1funcs.asm @@ -1,5 +1,5 @@ /* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - 2004, 2005 + 2004, 2005, 2006 Free Software Foundation, Inc. 
This file is free software; you can redistribute it and/or modify it @@ -3019,8 +3019,8 @@ GLOBAL(GCC_pop_shmedia_regs_nofpu): #endif /* __SH5__ == 32 */ #endif /* L_push_pop_shmedia_regs */ -#if __SH5__ #ifdef L_div_table +#if __SH5__ #if defined(__pic__) && defined(__SHMEDIA__) .global GLOBAL(sdivsi3) FUNC(GLOBAL(sdivsi3)) @@ -3247,5 +3247,632 @@ GLOBAL(div_table): .word 17738 .word 17136 .word 16639 + +#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__) +/* This code used shld, thus is not suitable for SH1 / SH2. */ + +/* Signed / unsigned division without use of FPU, optimized for SH4. + Uses a lookup table for divisors in the range -128 .. +128, and + div1 with case distinction for larger divisors in three more ranges. + The code is lumped together with the table to allow the use of mova. */ +#ifdef __LITTLE_ENDIAN__ +#define L_LSB 0 +#define L_LSWMSB 1 +#define L_MSWLSB 2 +#else +#define L_LSB 3 +#define L_LSWMSB 2 +#define L_MSWLSB 1 +#endif + + .balign 4 + .global GLOBAL(udivsi3_i4i) + FUNC(GLOBAL(udivsi3_i4i)) +GLOBAL(udivsi3_i4i): + mov.w LOCAL(c128_w), r1 + div0u + mov r4,r0 + shlr8 r0 + cmp/hi r1,r5 + extu.w r5,r1 + bf LOCAL(udiv_le128) + cmp/eq r5,r1 + bf LOCAL(udiv_ge64k) + shlr r0 + mov r5,r1 + shll16 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 + div1 r5,r0 + div1 r5,r0 + bra LOCAL(udiv_25) + div1 r5,r0 + +LOCAL(div_le128): + mova LOCAL(div_table_ix),r0 + bra LOCAL(div_le128_2) + mov.b @(r0,r5),r1 +LOCAL(udiv_le128): + mov.l r4,@-r15 + mova LOCAL(div_table_ix),r0 + mov.b @(r0,r5),r1 + mov.l r5,@-r15 +LOCAL(div_le128_2): + mova LOCAL(div_table_inv),r0 + mov.l @(r0,r1),r1 + mov r5,r0 + tst #0xfe,r0 + mova LOCAL(div_table_clz),r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + bt/s LOCAL(div_by_1) + mov r4,r0 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + rts + shld r1,r0 + +LOCAL(div_by_1_neg): + neg r4,r0 
+LOCAL(div_by_1): + mov.l @r15+,r5 + rts + mov.l @r15+,r4 + +LOCAL(div_ge64k): + bt/s LOCAL(div_r8) + div0u + shll8 r5 + bra LOCAL(div_ge64k_2) + div1 r5,r0 +LOCAL(udiv_ge64k): + cmp/hi r0,r5 + mov r5,r1 + bt LOCAL(udiv_r8) + shll8 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 +LOCAL(div_ge64k_2): + div1 r5,r0 + mov.l LOCAL(zero_l),r1 + .rept 4 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w LOCAL(m256_w),r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra LOCAL(div_ge64k_end) + xor r4,r0 + +LOCAL(div_r8): + shll16 r4 + bra LOCAL(div_r8_2) + shll8 r4 +LOCAL(udiv_r8): + mov.l r4,@-r15 + shll16 r4 + clrt + shll8 r4 + mov.l r5,@-r15 +LOCAL(div_r8_2): + rotcl r4 + mov r0,r1 + div1 r5,r1 + mov r4,r0 + rotcl r0 + mov r5,r4 + div1 r5,r1 + .rept 5 + rotcl r0; div1 r5,r1 + .endr + rotcl r0 + mov.l @r15+,r5 + div1 r4,r1 + mov.l @r15+,r4 + rts + rotcl r0 + + ENDFUNC(GLOBAL(udivsi3_i4i)) + + .global GLOBAL(sdivsi3_i4i) + FUNC(GLOBAL(sdivsi3_i4i)) + /* This is link-compatible with a GLOBAL(sdivsi3) call, + but we effectively clobber only r1. */ +GLOBAL(sdivsi3_i4i): + mov.l r4,@-r15 + cmp/pz r5 + mov.w LOCAL(c128_w), r1 + bt/s LOCAL(pos_divisor) + cmp/pz r4 + mov.l r5,@-r15 + neg r5,r5 + bt/s LOCAL(neg_result) + cmp/hi r1,r5 + neg r4,r4 +LOCAL(pos_result): + extu.w r5,r0 + bf LOCAL(div_le128) + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s LOCAL(div_ge64k) + cmp/hi r0,r5 + div0u + shll16 r5 + div1 r5,r0 + div1 r5,r0 + div1 r5,r0 +LOCAL(udiv_25): + mov.l LOCAL(zero_l),r1 + div1 r5,r0 + div1 r5,r0 + mov.l r1,@-r15 + .rept 3 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +LOCAL(div_ge64k_end): + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. 
+ extu.b r0,r0 + mov.l @r15+,r5 + or r4,r0 + mov.l @r15+,r4 + rts + rotcl r0 + +LOCAL(div_le128_neg): + tst #0xfe,r0 + mova LOCAL(div_table_ix),r0 + mov.b @(r0,r5),r1 + mova LOCAL(div_table_inv),r0 + bt/s LOCAL(div_by_1_neg) + mov.l @(r0,r1),r1 + mova LOCAL(div_table_clz),r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + shld r1,r0 + rts + neg r0,r0 + +LOCAL(pos_divisor): + mov.l r5,@-r15 + bt/s LOCAL(pos_result) + cmp/hi r1,r5 + neg r4,r4 +LOCAL(neg_result): + extu.w r5,r0 + bf LOCAL(div_le128_neg) + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s LOCAL(div_ge64k_neg) + cmp/hi r0,r5 + div0u + mov.l LOCAL(zero_l),r1 + shll16 r5 + div1 r5,r0 + mov.l r1,@-r15 + .rept 7 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +LOCAL(div_ge64k_neg_end): + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. + extu.b r0,r1 + mov.l @r15+,r5 + or r4,r1 +LOCAL(div_r8_neg_end): + mov.l @r15+,r4 + rotcl r1 + rts + neg r1,r0 + +LOCAL(div_ge64k_neg): + bt/s LOCAL(div_r8_neg) + div0u + shll8 r5 + mov.l LOCAL(zero_l),r1 + .rept 6 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w LOCAL(m256_w),r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra LOCAL(div_ge64k_neg_end) + xor r4,r0 + +LOCAL(c128_w): + .word 128 + +LOCAL(div_r8_neg): + clrt + shll16 r4 + mov r4,r1 + shll8 r1 + mov r5,r4 + .rept 7 + rotcl r1; div1 r5,r0 + .endr + mov.l @r15+,r5 + rotcl r1 + bra LOCAL(div_r8_neg_end) + div1 r4,r0 + +LOCAL(m256_w): + .word 0xff00 +/* This table has been generated by divtab-sh4.c. 
*/ + .balign 4 +LOCAL(div_table_clz): + .byte 0 + .byte 1 + .byte 0 + .byte -1 + .byte -1 + .byte -2 + .byte -2 + .byte -2 + .byte -2 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 +/* Lookup table translating positive divisor to index into table of + normalized inverse. N.B. the '0' entry is also the last entry of the + previous table, and causes an unaligned access for division by zero. 
*/ +LOCAL(div_table_ix): + .byte -6 + .byte -128 + .byte -128 + .byte 0 + .byte -128 + .byte -64 + .byte 0 + .byte 64 + .byte -128 + .byte -96 + .byte -64 + .byte -32 + .byte 0 + .byte 32 + .byte 64 + .byte 96 + .byte -128 + .byte -112 + .byte -96 + .byte -80 + .byte -64 + .byte -48 + .byte -32 + .byte -16 + .byte 0 + .byte 16 + .byte 32 + .byte 48 + .byte 64 + .byte 80 + .byte 96 + .byte 112 + .byte -128 + .byte -120 + .byte -112 + .byte -104 + .byte -96 + .byte -88 + .byte -80 + .byte -72 + .byte -64 + .byte -56 + .byte -48 + .byte -40 + .byte -32 + .byte -24 + .byte -16 + .byte -8 + .byte 0 + .byte 8 + .byte 16 + .byte 24 + .byte 32 + .byte 40 + .byte 48 + .byte 56 + .byte 64 + .byte 72 + .byte 80 + .byte 88 + .byte 96 + .byte 104 + .byte 112 + .byte 120 + .byte -128 + .byte -124 + .byte -120 + .byte -116 + .byte -112 + .byte -108 + .byte -104 + .byte -100 + .byte -96 + .byte -92 + .byte -88 + .byte -84 + .byte -80 + .byte -76 + .byte -72 + .byte -68 + .byte -64 + .byte -60 + .byte -56 + .byte -52 + .byte -48 + .byte -44 + .byte -40 + .byte -36 + .byte -32 + .byte -28 + .byte -24 + .byte -20 + .byte -16 + .byte -12 + .byte -8 + .byte -4 + .byte 0 + .byte 4 + .byte 8 + .byte 12 + .byte 16 + .byte 20 + .byte 24 + .byte 28 + .byte 32 + .byte 36 + .byte 40 + .byte 44 + .byte 48 + .byte 52 + .byte 56 + .byte 60 + .byte 64 + .byte 68 + .byte 72 + .byte 76 + .byte 80 + .byte 84 + .byte 88 + .byte 92 + .byte 96 + .byte 100 + .byte 104 + .byte 108 + .byte 112 + .byte 116 + .byte 120 + .byte 124 + .byte -128 +/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. 
*/ + .balign 4 +LOCAL(zero_l): + .long 0x0 + .long 0xF81F81F9 + .long 0xF07C1F08 + .long 0xE9131AC0 + .long 0xE1E1E1E2 + .long 0xDAE6076C + .long 0xD41D41D5 + .long 0xCD856891 + .long 0xC71C71C8 + .long 0xC0E07039 + .long 0xBACF914D + .long 0xB4E81B4F + .long 0xAF286BCB + .long 0xA98EF607 + .long 0xA41A41A5 + .long 0x9EC8E952 + .long 0x9999999A + .long 0x948B0FCE + .long 0x8F9C18FA + .long 0x8ACB90F7 + .long 0x86186187 + .long 0x81818182 + .long 0x7D05F418 + .long 0x78A4C818 + .long 0x745D1746 + .long 0x702E05C1 + .long 0x6C16C16D + .long 0x68168169 + .long 0x642C8591 + .long 0x60581606 + .long 0x5C9882BA + .long 0x58ED2309 +LOCAL(div_table_inv): + .long 0x55555556 + .long 0x51D07EAF + .long 0x4E5E0A73 + .long 0x4AFD6A06 + .long 0x47AE147B + .long 0x446F8657 + .long 0x41414142 + .long 0x3E22CBCF + .long 0x3B13B13C + .long 0x38138139 + .long 0x3521CFB3 + .long 0x323E34A3 + .long 0x2F684BDB + .long 0x2C9FB4D9 + .long 0x29E4129F + .long 0x27350B89 + .long 0x24924925 + .long 0x21FB7813 + .long 0x1F7047DD + .long 0x1CF06ADB + .long 0x1A7B9612 + .long 0x18118119 + .long 0x15B1E5F8 + .long 0x135C8114 + .long 0x11111112 + .long 0xECF56BF + .long 0xC9714FC + .long 0xA6810A7 + .long 0x8421085 + .long 0x624DD30 + .long 0x4104105 + .long 0x2040811 + /* maximum error: 0.987342 scaled: 0.921875*/ + + ENDFUNC(GLOBAL(sdivsi3_i4i)) +#endif /* SH3 / SH4 */ + #endif /* L_div_table */ -#endif /* __SH5__ */ diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h index 099c938e269..1b7ff5dbe2c 100644 --- a/gcc/config/sh/sh.h +++ b/gcc/config/sh/sh.h @@ -234,6 +234,9 @@ do { \ #define TARGET_DIVIDE_INV20L (sh_div_strategy == SH_DIV_INV20L) #define TARGET_DIVIDE_INV_CALL (sh_div_strategy == SH_DIV_INV_CALL) #define TARGET_DIVIDE_INV_CALL2 (sh_div_strategy == SH_DIV_INV_CALL2) +#define TARGET_DIVIDE_CALL_DIV1 (sh_div_strategy == SH_DIV_CALL_DIV1) +#define TARGET_DIVIDE_CALL_FP (sh_div_strategy == SH_DIV_CALL_FP) +#define TARGET_DIVIDE_CALL_TABLE (sh_div_strategy == SH_DIV_CALL_TABLE) #define 
SELECT_SH1 (MASK_SH1) #define SELECT_SH2 (MASK_SH2 | SELECT_SH1) @@ -467,7 +470,7 @@ do { \ sh_div_str = SH_DIV_STR_FOR_SIZE ; \ } \ /* We can't meaningfully test TARGET_SHMEDIA here, because -m options \ - haven't been parsed yet, hence we';d read only the default. \ + haven't been parsed yet, hence we'd read only the default. \ sh_target_reg_class will return NO_REGS if this is not SHMEDIA, so \ it's OK to always set flag_branch_target_load_optimize. */ \ if (LEVEL > 1) \ @@ -492,16 +495,24 @@ do { \ extern int assembler_dialect; enum sh_divide_strategy_e { + /* SH5 strategies. */ SH_DIV_CALL, SH_DIV_CALL2, - SH_DIV_FP, + SH_DIV_FP, /* We could do this also for SH4. */ SH_DIV_INV, SH_DIV_INV_MINLAT, SH_DIV_INV20U, SH_DIV_INV20L, SH_DIV_INV_CALL, SH_DIV_INV_CALL2, - SH_DIV_INV_FP + SH_DIV_INV_FP, + /* SH1 .. SH4 strategies. Because of the small number of registers + available, the compiler uses knowledge of the actual set of registers + being clobbered by the different functions called. */ + SH_DIV_CALL_DIV1, /* No FPU, medium size, highest latency. */ + SH_DIV_CALL_FP, /* FPU needed, small size, high latency. */ + SH_DIV_CALL_TABLE, /* No FPU, large size, medium latency. */ + SH_DIV_INTRINSIC }; extern enum sh_divide_strategy_e sh_div_strategy; @@ -611,17 +622,46 @@ do { \ targetm.asm_out.aligned_op.di = NULL; \ targetm.asm_out.unaligned_op.di = NULL; \ } \ + if (TARGET_SH1) \ + { \ + if (! strcmp (sh_div_str, "call-div1")) \ + sh_div_strategy = SH_DIV_CALL_DIV1; \ + else if (! strcmp (sh_div_str, "call-fp") \ + && (TARGET_FPU_DOUBLE \ + || (TARGET_HARD_SH4 && TARGET_SH2E) \ + || (TARGET_SHCOMPACT && TARGET_FPU_ANY))) \ + sh_div_strategy = SH_DIV_CALL_FP; \ + else if (! strcmp (sh_div_str, "call-table") && TARGET_SH3) \ + sh_div_strategy = SH_DIV_CALL_TABLE; \ + else \ + /* Pick one that makes most sense for the target in general. 
\ + It is not much good to use different functions depending \ + on -Os, since then we'll end up with two different functions \ + when some of the code is compiled for size, and some for \ + speed. */ \ + \ + /* SH4 tends to emphasize speed. */ \ + if (TARGET_HARD_SH4) \ + sh_div_strategy = SH_DIV_CALL_TABLE; \ + /* These have their own way of doing things. */ \ + else if (TARGET_SH2A) \ + sh_div_strategy = SH_DIV_INTRINSIC; \ + /* ??? Should we use the integer SHmedia function instead? */ \ + else if (TARGET_SHCOMPACT && TARGET_FPU_ANY) \ + sh_div_strategy = SH_DIV_CALL_FP; \ + /* SH1 .. SH3 cores often go into small-footprint systems, so \ + default to the smallest implementation available. */ \ + else \ + sh_div_strategy = SH_DIV_CALL_DIV1; \ + } \ if (sh_divsi3_libfunc[0]) \ ; /* User supplied - leave it alone. */ \ - else if (TARGET_HARD_SH4 && TARGET_SH2E) \ + else if (TARGET_DIVIDE_CALL_FP) \ sh_divsi3_libfunc = "__sdivsi3_i4"; \ + else if (TARGET_DIVIDE_CALL_TABLE) \ + sh_divsi3_libfunc = "__sdivsi3_i4i"; \ else if (TARGET_SH5) \ - { \ - if (TARGET_FPU_ANY && TARGET_SH1) \ - sh_divsi3_libfunc = "__sdivsi3_i4"; \ - else \ - sh_divsi3_libfunc = "__sdivsi3_1"; \ - } \ + sh_divsi3_libfunc = "__sdivsi3_1"; \ else \ sh_divsi3_libfunc = "__sdivsi3"; \ if (TARGET_FMOVD) \ diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index e2e477f2a57..1c1357e0309 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -1739,6 +1739,19 @@ [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) +(define_insn "udivsi3_i4_int" + [(set (match_operand:SI 0 "register_operand" "=z") + (udiv:SI (reg:SI R4_REG) (reg:SI R5_REG))) + (clobber (reg:SI T_REG)) + (clobber (reg:SI R1_REG)) + (clobber (reg:SI PR_REG)) + (use (match_operand:SI 1 "arith_reg_operand" "r"))] + "TARGET_SH1" + "jsr @%1%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + + (define_expand "udivsi3" [(set (match_dup 3) (symbol_ref:SI "__udivsi3")) (set (reg:SI R4_REG) 
(match_operand:SI 1 "general_operand" "")) @@ -1757,7 +1770,12 @@ operands[3] = gen_reg_rtx (Pmode); /* Emit the move of the address to a pseudo outside of the libcall. */ - if (TARGET_HARD_SH4 && TARGET_SH2E) + if (TARGET_DIVIDE_CALL_TABLE) + { + function_symbol (operands[3], \"__udivsi3_i4i\", SFUNC_GOT); + last = gen_udivsi3_i4_int (operands[0], operands[3]); + } + else if (TARGET_DIVIDE_CALL_FP) { function_symbol (operands[3], \"__udivsi3_i4\", SFUNC_STATIC); if (TARGET_FPU_SINGLE) @@ -1975,6 +1993,18 @@ [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) +(define_insn "divsi3_i4_int" + [(set (match_operand:SI 0 "register_operand" "=z") + (div:SI (reg:SI R4_REG) (reg:SI R5_REG))) + (clobber (reg:SI T_REG)) + (clobber (reg:SI PR_REG)) + (clobber (reg:SI R1_REG)) + (use (match_operand:SI 1 "arith_reg_operand" "r"))] + "TARGET_SH1" + "jsr @%1%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + (define_expand "divsi3" [(set (match_dup 3) (symbol_ref:SI "__sdivsi3")) (set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" "")) @@ -1995,7 +2025,12 @@ operands[3] = gen_reg_rtx (Pmode); /* Emit the move of the address to a pseudo outside of the libcall. */ - if (TARGET_HARD_SH4 && TARGET_SH2E) + if (TARGET_DIVIDE_CALL_TABLE) + { + function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_GOT); + last = gen_divsi3_i4_int (operands[0], operands[3]); + } + else if (TARGET_DIVIDE_CALL_FP) { function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_STATIC); if (TARGET_FPU_SINGLE) diff --git a/gcc/config/sh/sh.opt b/gcc/config/sh/sh.opt index db332f35181..9b072fef7cb 100644 --- a/gcc/config/sh/sh.opt +++ b/gcc/config/sh/sh.opt @@ -1,6 +1,6 @@ ; Options for the SH port of the compiler. -; Copyright (C) 2005 Free Software Foundation, Inc. +; Copyright (C) 2005, 2006 Free Software Foundation, Inc. ; ; This file is part of GCC. 
; @@ -158,7 +158,7 @@ Align doubles at 64-bit boundaries mdiv= Target RejectNegative Joined Var(sh_div_str) Init("") -Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp +Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp, call-div1, call-fp, call-table mdivsi3_libfunc= Target RejectNegative Joined Var(sh_divsi3_libfunc) Init("") diff --git a/gcc/config/sh/t-sh b/gcc/config/sh/t-sh index db86ad18c1d..65cc1ec53e9 100644 --- a/gcc/config/sh/t-sh +++ b/gcc/config/sh/t-sh @@ -5,6 +5,7 @@ sh-c.o: $(srcdir)/config/sh/sh-c.c \ LIB1ASMSRC = sh/lib1funcs.asm LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \ _movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \ + _div_table \ $(LIB1ASMFUNCS_CACHE) # We want fine grained libraries, so use the new code to build the