re PR target/6526 ([SH4] sdivsi3_i4 can clobber xd0/xd2)
PR target/6526 * config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation. PR target/6526 * gcc.target/sh/pr6526.c: New. From-SVN: r199873
This commit is contained in:
parent
3e56ed50d7
commit
0d00888247
@ -1,3 +1,8 @@
|
||||
2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
|
||||
|
||||
PR target/6526
|
||||
* gcc.target/sh/pr6526.c: New.
|
||||
|
||||
2013-06-09 Jakub Jelinek <jakub@redhat.com>
|
||||
|
||||
PR target/57568
|
||||
|
64
gcc/testsuite/gcc.target/sh/pr6526.c
Normal file
64
gcc/testsuite/gcc.target/sh/pr6526.c
Normal file
@ -0,0 +1,64 @@
|
||||
/* Check that the XF registers are not clobbered by an integer division
|
||||
that is done using double precision FPU division. */
|
||||
/* { dg-do run { target "sh*-*-*" } } */
|
||||
/* { dg-options "-O1 -mdiv=call-fp" } */
|
||||
/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
extern void __set_fpscr (int);
|
||||
|
||||
/* Load the float stored at *f into fr0, with an frchg instruction executed
   before and after the load.  Judging by the function name and the header
   comment of this test, the frchg pair is there so that fr0 temporarily
   names XF0, i.e. the value ends up in the other FP register bank.
   The asm has no outputs, takes f in a general register and declares a
   "memory" clobber.  */
void
|
||||
write_xf0 (float* f)
|
||||
{
|
||||
__asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory");
|
||||
}
|
||||
|
||||
/* Counterpart of write_xf0: store fr0 to *f, with an frchg instruction
   executed before and after the store.  Judging by the function name, the
   frchg pair makes fr0 temporarily name XF0, so the value read back comes
   from the other FP register bank.
   The store to *f is not expressed as an asm output operand; the "memory"
   clobber is what tells the compiler that memory is modified.  */
void
|
||||
read_xf0 (float* f)
|
||||
{
|
||||
__asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory");
|
||||
}
|
||||
|
||||
/* Signed 32 bit division, kept out of line by the noinline attribute.
   With the -mdiv=call-fp option from dg-options this is presumably
   compiled into a call to the sdivsi3_i4 library routine -- confirm
   against the generated code.  */
int __attribute__ ((noinline))
|
||||
test_00 (int a, int b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
|
||||
/* Unsigned 32 bit division, kept out of line by the noinline attribute.
   With the -mdiv=call-fp option from dg-options this is presumably
   compiled into a call to the udivsi3_i4 library routine -- confirm
   against the generated code.  */
unsigned int __attribute__ ((noinline))
|
||||
test_01 (unsigned a, unsigned b)
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
|
||||
/* Returns x & 0, which is always 0.  main passes the accumulated division
   results through this function, so they are consumed while the value
   returned from main is still 0.  */
int __attribute__ ((noinline))
|
||||
test_02 (int x)
|
||||
{
|
||||
return x & 0;
|
||||
}
|
||||
|
||||
/* Test driver.  For a signed and then an unsigned division: store a known
   value through write_xf0, perform the division, read the value back
   through read_xf0 and assert that it is unchanged.  */
int
|
||||
main (void)
|
||||
{
|
||||
float test_value;
|
||||
int r = 0;
|
||||
|
||||
/* Set FPSCR.FR to 1. */
|
||||
__set_fpscr (0x200000);
|
||||
|
||||
/* Signed division must leave the stored value intact.  */
test_value = 123;
|
||||
write_xf0 (&test_value);
|
||||
r += test_00 (40, 4);
|
||||
read_xf0 (&test_value);
|
||||
assert (test_value == 123);
|
||||
|
||||
/* Same check for the unsigned division.  */
test_value = 321;
|
||||
write_xf0 (&test_value);
|
||||
r += test_01 (50, 5);
|
||||
read_xf0 (&test_value);
|
||||
assert (test_value == 321);
|
||||
|
||||
/* test_02 always yields 0; routing r through it keeps the quotients used.  */
return test_02 (r);
|
||||
}
|
@ -1,3 +1,9 @@
|
||||
2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
|
||||
|
||||
PR target/6526
|
||||
* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits
|
||||
other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation.
|
||||
|
||||
2013-06-08 Walter Lee <walt@tilera.com>
|
||||
|
||||
* config/tilepro/atomic.h: Don't include stdint.h or features.h.
|
||||
|
@ -1003,11 +1003,17 @@ hiset: sts macl,r0 ! r0 = bb*dd
|
||||
ENDFUNC(GLOBAL(mulsi3))
|
||||
#endif
|
||||
#endif /* ! __SH5__ */
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
32 bit signed integer division that uses FPU double precision division. */
|
||||
|
||||
#ifdef L_sdivsi3_i4
|
||||
.title "SH DIVIDE"
|
||||
!! 4 byte integer Divide code for the Renesas SH
|
||||
|
||||
#if defined (__SH4__) || defined (__SH2A__)
|
||||
!! args in r4 and r5, result in fpul, clobber dr0, dr2
|
||||
/* This variant is used when FPSCR.PR = 1 (double precision) is the default
|
||||
setting.
|
||||
Args in r4 and r5, result in fpul, clobber dr0, dr2. */
|
||||
|
||||
.global GLOBAL(sdivsi3_i4)
|
||||
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
|
||||
@ -1021,8 +1027,13 @@ GLOBAL(sdivsi3_i4):
|
||||
ftrc dr0,fpul
|
||||
|
||||
ENDFUNC(GLOBAL(sdivsi3_i4))
|
||||
|
||||
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
|
||||
!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
|
||||
/* This variant is used when FPSCR.PR = 0 (single precision) is the default
|
||||
setting.
|
||||
Args in r4 and r5, result in fpul, clobber r2, dr0, dr2.
|
||||
For this to work, we must temporarily switch the FPU to double precision,
|
||||
but we had better not touch FPSCR.FR. See PR 6526. */
|
||||
|
||||
#if ! __SH5__ || __SH5__ == 32
|
||||
#if __SH5__
|
||||
@ -1031,10 +1042,26 @@ GLOBAL(sdivsi3_i4):
|
||||
.global GLOBAL(sdivsi3_i4)
|
||||
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
|
||||
GLOBAL(sdivsi3_i4):
|
||||
sts.l fpscr,@-r15
|
||||
mov #8,r2
|
||||
swap.w r2,r2
|
||||
|
||||
#ifndef __SH4A__
|
||||
mov.l r3,@-r15
|
||||
sts fpscr,r2
|
||||
mov #8,r3
|
||||
swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit)
|
||||
or r2,r3
|
||||
lds r3,fpscr // Set FPSCR.PR = 1.
|
||||
lds r4,fpul
|
||||
float fpul,dr0
|
||||
lds r5,fpul
|
||||
float fpul,dr2
|
||||
fdiv dr2,dr0
|
||||
ftrc dr0,fpul
|
||||
lds r2,fpscr
|
||||
rts
|
||||
mov.l @r15+,r3
|
||||
#else
|
||||
/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */
|
||||
fpchg
|
||||
lds r4,fpul
|
||||
float fpul,dr0
|
||||
lds r5,fpul
|
||||
@ -1042,13 +1069,16 @@ GLOBAL(sdivsi3_i4):
|
||||
fdiv dr2,dr0
|
||||
ftrc dr0,fpul
|
||||
rts
|
||||
lds.l @r15+,fpscr
|
||||
fpchg
|
||||
|
||||
#endif /* __SH4A__ */
|
||||
|
||||
ENDFUNC(GLOBAL(sdivsi3_i4))
|
||||
#endif /* ! __SH5__ || __SH5__ == 32 */
|
||||
#endif /* ! __SH4__ || __SH2A__ */
|
||||
#endif
|
||||
#endif /* L_sdivsi3_i4 */
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
#ifdef L_sdivsi3
|
||||
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
|
||||
sh2e/sh3e code. */
|
||||
@ -1368,21 +1398,26 @@ div0: rts
|
||||
|
||||
ENDFUNC(GLOBAL(sdivsi3))
|
||||
#endif /* ! __SHMEDIA__ */
|
||||
#endif
|
||||
#ifdef L_udivsi3_i4
|
||||
#endif /* L_sdivsi3 */
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
32 bit unsigned integer division that uses FPU double precision division. */
|
||||
|
||||
#ifdef L_udivsi3_i4
|
||||
.title "SH DIVIDE"
|
||||
!! 4 byte integer Divide code for the Renesas SH
|
||||
|
||||
#if defined (__SH4__) || defined (__SH2A__)
|
||||
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
|
||||
!! and t bit
|
||||
/* This variant is used when FPSCR.PR = 1 (double precision) is the default
|
||||
setting.
|
||||
Args in r4 and r5, result in fpul,
|
||||
clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */
|
||||
|
||||
.global GLOBAL(udivsi3_i4)
|
||||
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
|
||||
GLOBAL(udivsi3_i4):
|
||||
mov #1,r1
|
||||
cmp/hi r1,r5
|
||||
bf trivial
|
||||
bf/s trivial
|
||||
rotr r1
|
||||
xor r1,r4
|
||||
lds r4,fpul
|
||||
@ -1409,12 +1444,13 @@ trivial:
|
||||
|
||||
.align 2
|
||||
#ifdef FMOVD_WORKS
|
||||
.align 3 ! make double below 8 byte aligned.
|
||||
.align 3 // Make the double below 8 byte aligned.
|
||||
#endif
|
||||
L1:
|
||||
.double 2147483648
|
||||
|
||||
ENDFUNC(GLOBAL(udivsi3_i4))
|
||||
|
||||
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__)
|
||||
#if ! __SH5__ || __SH5__ == 32
|
||||
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
|
||||
@ -1436,21 +1472,33 @@ GLOBAL(udivsi3_i4):
|
||||
|
||||
ENDFUNC(GLOBAL(udivsi3_i4))
|
||||
#endif /* ! __SH5__ || __SH5__ == 32 */
|
||||
|
||||
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
|
||||
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
|
||||
/* This variant is used when FPSCR.PR = 0 (single precision) is the default
|
||||
setting.
|
||||
Args in r4 and r5, result in fpul,
|
||||
clobber r0, r1, r4, r5, dr0, dr2, dr4.
|
||||
For this to work, we must temporarily switch the FPU to double precision,
|
||||
but we had better not touch FPSCR.FR. See PR 6526. */
|
||||
|
||||
.global GLOBAL(udivsi3_i4)
|
||||
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
|
||||
GLOBAL(udivsi3_i4):
|
||||
|
||||
#ifndef __SH4A__
|
||||
mov #1,r1
|
||||
cmp/hi r1,r5
|
||||
bf trivial
|
||||
bf/s trivial
|
||||
rotr r1 // r1 = 1 << 31
|
||||
sts.l fpscr,@-r15
|
||||
mova L1,r0
|
||||
lds.l @r0+,fpscr
|
||||
rotr r1
|
||||
xor r1,r4
|
||||
mov.l @(0,r15),r0
|
||||
xor r1,r5
|
||||
mov.l L2,r1
|
||||
lds r4,fpul
|
||||
or r0,r1
|
||||
mova L1,r0
|
||||
lds r1,fpscr
|
||||
#ifdef FMOVD_WORKS
|
||||
fmov.d @r0+,dr4
|
||||
#else
|
||||
@ -1458,7 +1506,6 @@ GLOBAL(udivsi3_i4):
|
||||
fmov.s @r0,DR41
|
||||
#endif
|
||||
float fpul,dr0
|
||||
xor r1,r5
|
||||
lds r5,fpul
|
||||
float fpul,dr2
|
||||
fadd dr4,dr0
|
||||
@ -1469,24 +1516,62 @@ GLOBAL(udivsi3_i4):
|
||||
lds.l @r15+,fpscr
|
||||
|
||||
#ifdef FMOVD_WORKS
|
||||
.align 3 ! make double below 8 byte aligned.
|
||||
.align 3 // Make the double below 8 byte aligned.
|
||||
#endif
|
||||
trivial:
|
||||
rts
|
||||
lds r4,fpul
|
||||
|
||||
.align 2
|
||||
L1:
|
||||
#ifndef FMOVD_WORKS
|
||||
.long 0x80000
|
||||
L2:
|
||||
#ifdef FMOVD_WORKS
|
||||
.long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1
|
||||
#else
|
||||
.long 0x180000
|
||||
.long 0x80000 // FPSCR.PR = 1
|
||||
#endif
|
||||
L1:
|
||||
.double 2147483648
|
||||
|
||||
#else
|
||||
/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.
|
||||
Although on SH4A fmovd usually works, it would require either additional
|
||||
two fschg instructions or an FPSCR push + pop. It's not worth the effort
|
||||
for loading only one double constant. */
|
||||
mov #1,r1
|
||||
cmp/hi r1,r5
|
||||
bf/s trivial
|
||||
rotr r1 // r1 = 1 << 31
|
||||
fpchg
|
||||
mova L1,r0
|
||||
xor r1,r4
|
||||
fmov.s @r0+,DR40
|
||||
lds r4,fpul
|
||||
fmov.s @r0,DR41
|
||||
xor r1,r5
|
||||
float fpul,dr0
|
||||
lds r5,fpul
|
||||
float fpul,dr2
|
||||
fadd dr4,dr0
|
||||
fadd dr4,dr2
|
||||
fdiv dr2,dr0
|
||||
ftrc dr0,fpul
|
||||
rts
|
||||
fpchg
|
||||
|
||||
trivial:
|
||||
rts
|
||||
lds r4,fpul
|
||||
|
||||
.align 2
|
||||
L1:
|
||||
.double 2147483648
|
||||
|
||||
#endif /* __SH4A__ */
|
||||
|
||||
|
||||
ENDFUNC(GLOBAL(udivsi3_i4))
|
||||
#endif /* ! __SH4__ */
|
||||
#endif
|
||||
#endif /* L_udivsi3_i4 */
|
||||
|
||||
#ifdef L_udivsi3
|
||||
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
|
||||
|
Loading…
Reference in New Issue
Block a user