re PR target/6526 ([SH4] sdivsi3_i4 can clobber xd0/xd2)

PR target/6526
	* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits
	other than FPSCR.PR and FPSCR.SZ.  Add SH4A implementation.

	PR target/6526
	* gcc.target/sh/pr6526.c: New.

From-SVN: r199873
This commit is contained in:
Oleg Endo 2013-06-09 21:32:37 +00:00
parent 3e56ed50d7
commit 0d00888247
4 changed files with 231 additions and 71 deletions

View File

@ -1,3 +1,8 @@
2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
PR target/6526
* gcc.target/sh/pr6526.c: New.
2013-06-09 Jakub Jelinek <jakub@redhat.com>
PR target/57568

View File

@ -0,0 +1,64 @@
/* Check that the XF registers are not clobbered by an integer division
that is done using double precision FPU division. */
/* { dg-do run { target "sh*-*-*" } } */
/* { dg-options "-O1 -mdiv=call-fp" } */
/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */
#include <assert.h>
#include <stdlib.h>
extern void __set_fpscr (int);
/* Load the float at *F into fr0 of the FPU register bank that is not
   currently selected.  The first frchg toggles FPSCR.FR so that the name
   fr0 refers to the other bank, fmov.s loads the value from memory, and
   the second frchg toggles FPSCR.FR back so the caller sees its original
   bank selection again.  The "memory" clobber tells the compiler that the
   asm accesses memory through F.  */
void
write_xf0 (float* f)
{
__asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory");
}
/* Store fr0 of the FPU register bank that is not currently selected to the
   float at *F.  This is the counterpart of write_xf0: frchg toggles
   FPSCR.FR, fmov.s writes the register to memory, and the second frchg
   restores the original bank selection.  The "memory" clobber makes the
   compiler reload *F after the asm instead of reusing a stale value.  */
void
read_xf0 (float* f)
{
__asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory");
}
/* Signed 32 bit division of A by B.  The function is marked noinline so
   that the division is emitted here as a real operation on run-time
   operands; with the -mdiv=call-fp option used by this test that is
   presumably the FPU based signed division helper.  */
int __attribute__ ((noinline))
test_00 (int a, int b)
{
  int quotient = a / b;
  return quotient;
}
/* Unsigned 32 bit division of A by B.  The function is marked noinline so
   that the division is emitted here as a real operation on run-time
   operands; with the -mdiv=call-fp option used by this test that is
   presumably the FPU based unsigned division helper.  */
unsigned int __attribute__ ((noinline))
test_01 (unsigned a, unsigned b)
{
  unsigned int quotient = a / b;
  return quotient;
}
/* Consume X and always return 0.  main passes the accumulated division
   results through this function to form its exit status, so the results
   have a use while the exit status stays 0.  */
int __attribute__ ((noinline))
test_02 (int x)
{
  const int keep_no_bits = 0;
  return x & keep_no_bits;
}
/* Test driver.  For the signed and for the unsigned division, place a
   marker value in fr0 of the inactive FPU register bank, perform the
   division, and verify that the marker survived.  In addition verify that
   the divisions returned the expected quotients; without that check the
   results were only fed to test_02, which discards them, so a division
   routine returning a wrong value would have gone unnoticed.  Returns 0 on
   success; a failed assertion aborts the program.  */
int
main (void)
{
  float test_value;
  int r = 0;

  /* Set FPSCR.FR to 1.  */
  __set_fpscr (0x200000);

  /* Signed division: 40 / 4.  The quotient is checked only after the
     marker has been read back, so that nothing but the division runs
     between write_xf0 and read_xf0.  */
  test_value = 123;
  write_xf0 (&test_value);
  r += test_00 (40, 4);
  read_xf0 (&test_value);
  assert (test_value == 123);
  assert (r == 10);

  /* Unsigned division: 50 / 5, checked the same way.  */
  test_value = 321;
  write_xf0 (&test_value);
  r += test_01 (50, 5);
  read_xf0 (&test_value);
  assert (test_value == 321);
  assert (r == 20);

  return test_02 (r);
}

View File

@ -1,3 +1,9 @@
2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
PR target/6526
* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits
other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation.
2013-06-08 Walter Lee <walt@tilera.com>
* config/tilepro/atomic.h: Don't include stdint.h or features.h.

View File

@ -1003,11 +1003,17 @@ hiset: sts macl,r0 ! r0 = bb*dd
ENDFUNC(GLOBAL(mulsi3))
#endif
#endif /* ! __SH5__ */
/*------------------------------------------------------------------------------
32 bit signed integer division that uses FPU double precision division. */
#ifdef L_sdivsi3_i4
.title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
#if defined (__SH4__) || defined (__SH2A__)
!! args in r4 and r5, result in fpul, clobber dr0, dr2
/* This variant is used when FPSCR.PR = 1 (double precision) is the default
setting.
Args in r4 and r5, result in fpul, clobber dr0, dr2. */
.global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
@ -1021,8 +1027,13 @@ GLOBAL(sdivsi3_i4):
ftrc dr0,fpul
ENDFUNC(GLOBAL(sdivsi3_i4))
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
/* This variant is used when FPSCR.PR = 0 (single precision) is the default
setting.
Args in r4 and r5, result in fpul, clobber r2, dr0, dr2.
For this to work, we must temporarily switch the FPU to double precision,
but we had better not touch FPSCR.FR. See PR 6526. */
#if ! __SH5__ || __SH5__ == 32
#if __SH5__
@ -1031,10 +1042,26 @@ GLOBAL(sdivsi3_i4):
.global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
sts.l fpscr,@-r15
mov #8,r2
swap.w r2,r2
#ifndef __SH4A__
// Non-SH4A path: compute r4 / r5 (signed) with a double precision FPU
// division.  Only the FPSCR.PR bit is set for the duration of the
// division; all other FPSCR bits keep the value the caller had.
// r3 is used as a scratch register and is saved on the stack.
mov.l r3,@-r15 // Save r3.
sts fpscr,r2 // r2 = FPSCR on entry, kept for the restore below.
mov #8,r3
swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit)
or r2,r3 // r3 = entry FPSCR with the PR bit set.
lds r3,fpscr // Set FPSCR.PR = 1.
lds r4,fpul
float fpul,dr0 // dr0 = (double) dividend
lds r5,fpul
float fpul,dr2 // dr2 = (double) divisor
fdiv dr2,dr0 // dr0 = dr0 / dr2
ftrc dr0,fpul // Result: quotient truncated to an integer in fpul.
lds r2,fpscr // Put back the FPSCR value read on entry.
rts
mov.l @r15+,r3 // Delay slot of rts: restore r3.
#else
/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */
fpchg
lds r4,fpul
float fpul,dr0
lds r5,fpul
@ -1042,13 +1069,16 @@ GLOBAL(sdivsi3_i4):
fdiv dr2,dr0
ftrc dr0,fpul
rts
lds.l @r15+,fpscr
fpchg
#endif /* __SH4A__ */
ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ || __SH2A__ */
#endif
#endif /* L_sdivsi3_i4 */
//------------------------------------------------------------------------------
#ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
sh2e/sh3e code. */
@ -1368,21 +1398,26 @@ div0: rts
ENDFUNC(GLOBAL(sdivsi3))
#endif /* ! __SHMEDIA__ */
#endif
#ifdef L_udivsi3_i4
#endif /* L_sdivsi3 */
/*------------------------------------------------------------------------------
32 bit unsigned integer division that uses FPU double precision division. */
#ifdef L_udivsi3_i4
.title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
#if defined (__SH4__) || defined (__SH2A__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
!! and t bit
/* This variant is used when FPSCR.PR = 1 (double precision) is the default
setting.
Args in r4 and r5, result in fpul,
clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */
.global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
mov #1,r1
cmp/hi r1,r5
bf trivial
bf/s trivial
rotr r1
xor r1,r4
lds r4,fpul
@ -1409,12 +1444,13 @@ trivial:
.align 2
#ifdef FMOVD_WORKS
.align 3 ! make double below 8 byte aligned.
.align 3 // Make the double below 8 byte aligned.
#endif
L1:
.double 2147483648
ENDFUNC(GLOBAL(udivsi3_i4))
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__)
#if ! __SH5__ || __SH5__ == 32
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
@ -1436,21 +1472,33 @@ GLOBAL(udivsi3_i4):
ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
/* This variant is used when FPSCR.PR = 0 (single precision) is the default
setting.
Args in r4 and r5, result in fpul,
clobber r0, r1, r4, r5, dr0, dr2, dr4.
For this to work, we must temporarily switch the FPU to double precision,
but we had better not touch FPSCR.FR. See PR 6526. */
.global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
#ifndef __SH4A__
mov #1,r1
cmp/hi r1,r5
bf trivial
bf/s trivial
rotr r1 // r1 = 1 << 31
sts.l fpscr,@-r15
mova L1,r0
lds.l @r0+,fpscr
rotr r1
xor r1,r4
mov.l @(0,r15),r0
xor r1,r5
mov.l L2,r1
lds r4,fpul
or r0,r1
mova L1,r0
lds r1,fpscr
#ifdef FMOVD_WORKS
fmov.d @r0+,dr4
#else
@ -1458,7 +1506,6 @@ GLOBAL(udivsi3_i4):
fmov.s @r0,DR41
#endif
float fpul,dr0
xor r1,r5
lds r5,fpul
float fpul,dr2
fadd dr4,dr0
@ -1469,24 +1516,62 @@ GLOBAL(udivsi3_i4):
lds.l @r15+,fpscr
#ifdef FMOVD_WORKS
.align 3 ! make double below 8 byte aligned.
.align 3 // Make the double below 8 byte aligned.
#endif
trivial:
rts
lds r4,fpul
.align 2
L1:
#ifndef FMOVD_WORKS
.long 0x80000
L2:
#ifdef FMOVD_WORKS
.long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1
#else
.long 0x180000
.long 0x80000 // FPSCR.PR = 1
#endif
L1:
.double 2147483648
#else
/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.
Although on SH4A fmovd usually works, it would require either additional
two fschg instructions or an FPSCR push + pop. It's not worth the effort
for loading only one double constant. */
// SH4A path: compute r4 / r5 (unsigned) with a double precision FPU
// division.  The int to double conversion is a signed one, so both
// operands get their top bit flipped first (which, read as signed, is the
// value minus 2^31) and the constant 2^31 at L1 is added back after the
// conversion.  A divisor of 0 or 1 takes the trivial exit, which returns
// the dividend unchanged.
mov #1,r1
cmp/hi r1,r5 // T = (r5 > 1), unsigned compare.
bf/s trivial // Divisor is 0 or 1: leave via the trivial exit.
rotr r1 // r1 = 1 << 31
// The rotr above sits in the delay slot of bf/s and is executed on both
// paths.
fpchg // Flip FPSCR.PR to double precision.
mova L1,r0 // r0 = address of the 2^31 constant.
xor r1,r4 // Flip the top bit of the dividend.
// DR40 / DR41 are defined outside this excerpt; presumably they name the
// two 32 bit halves of dr4 in memory order.  The constant is loaded with
// two single moves instead of one double move (see the note on fmovd
// above).
fmov.s @r0+,DR40
lds r4,fpul
fmov.s @r0,DR41
xor r1,r5 // Flip the top bit of the divisor.
float fpul,dr0 // dr0 = (double) biased dividend
lds r5,fpul
float fpul,dr2 // dr2 = (double) biased divisor
fadd dr4,dr0 // dr0 += 2^31, undoing the bias.
fadd dr4,dr2 // dr2 += 2^31, undoing the bias.
fdiv dr2,dr0 // dr0 = dr0 / dr2
ftrc dr0,fpul // Result: quotient truncated to an integer in fpul.
rts
fpchg // Delay slot of rts: flip FPSCR.PR back.
// Trivial exit: the result in fpul is the dividend itself.  FPSCR has not
// been modified on this path.
trivial:
rts
lds r4,fpul
.align 2
// Double precision constant 2^31 used to undo the operand bias.
L1:
.double 2147483648
#endif /* __SH4A__ */
ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */
#endif
#endif /* L_udivsi3_i4 */
#ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with