321 lines
6.2 KiB
ArmAsm
321 lines
6.2 KiB
ArmAsm
/* Copyright (C) 2006-2022 Free Software Foundation, Inc.
|
|
|
|
This file is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as published by the
|
|
Free Software Foundation; either version 3, or (at your option) any
|
|
later version.
|
|
|
|
This file is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
|
|
Under Section 7 of GPL version 3, you are granted additional
|
|
permissions described in the GCC Runtime Library Exception, version
|
|
3.1, as published by the Free Software Foundation.
|
|
|
|
You should have received a copy of the GNU General Public License and
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
/* Moderately Space-optimized libgcc routines for the Renesas SH /
   STMicroelectronics ST40 CPUs.
   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
|
|
|
|
#include "lib1funcs.h"
|
|
|
|
#ifdef L_udivsi3_i4i
|
|
|
|
/* 88 bytes; sh4-200 cycle counts (2G means 2^31, i.e. bit 31 set):
   divisor  >= 2G: 11 cycles
   dividend <  2G: 48 cycles
   dividend >= 2G, divisor != 1: 54 cycles
   dividend >= 2G, divisor == 1: 22 cycles  */
|
|
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
|
|
!! args in r4 and r5, result in r0, clobber r1

/* udivsi3_i4i, FPU variant: unsigned 32-bit division carried out with
   double-precision floating point arithmetic.

   In:	 r4 = dividend, r5 = divisor (both unsigned)
   Out:	 r0 = quotient
   Clobbers: r1 and the T bit.
   Saved and restored around the computation: fpscr, fpul, dr0, dr2.

   Three cases are distinguished:
   - divisor has bit 31 set: the quotient can only be 0 or 1, so it is
     produced with a single unsigned compare (huge_divisor).
   - dividend has bit 31 clear: both operands convert exactly with the
     signed int-to-double conversion; divide and truncate.
   - dividend has bit 31 set: the signed conversion yields a negative
     number, so 2^32 is added to obtain the unsigned value.  When the
     divisor is 1 both the add and the divide are skipped and the
     signed value is converted straight back, which reproduces the
     original bit pattern of the dividend.

   Instructions indented by one extra space sit in a branch delay slot.  */

	.global GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	mova L1,r0		! r0 -> constant pool at L1
	cmp/pz r5		! T = (divisor has bit 31 clear)
	sts fpscr,r1		! r1 = fpscr of the caller
	lds.l @r0+,fpscr	! fpscr = first pool word, r0 -> the double
	sts.l fpul,@-r15	! push fpul of the caller
	bf LOCAL(huge_divisor)
	mov.l r1,@-r15		! push fpscr of the caller
	lds r4,fpul
	cmp/pz r4		! T = (dividend has bit 31 clear)
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15	! save dr0
	float fpul,dr0		! dr0 = dividend, read as signed
	fmov.d dr2,@-r15	! save dr2
	bt LOCAL(dividend_adjusted)
	mov #1,r1
	fmov.d @r0,dr2		! dr2 = 4294967296.0 from the pool
	cmp/eq r1,r5
	bt LOCAL(div_by_1)
	fadd dr2,dr0		! dr0 += 2^32: unsigned value of the dividend
LOCAL(dividend_adjusted):
	lds r5,fpul
	float fpul,dr2		! dr2 = divisor (bit 31 is clear here)
	fdiv dr2,dr0		! dr0 = dividend / divisor
LOCAL(div_by_1):
	fmov.d @r15+,dr2	! restore dr2
	ftrc dr0,fpul		! fpul = dr0 converted back to an integer
	fmov.d @r15+,dr0	! restore dr0
#else /* !FMOVD_WORKS */
	/* Same algorithm, but every double register is moved as two
	   single-precision halves named by the DRxx macros.  */
	fmov.s DR01,@-r15
	mov #1,r1
	fmov.s DR00,@-r15
	float fpul,dr0		! dr0 = dividend, read as signed
	fmov.s DR21,@-r15
	bt/s LOCAL(dividend_adjusted)
	 fmov.s DR20,@-r15	! delay slot: finish saving dr2 on both paths
	cmp/eq r1,r5
	bt LOCAL(div_by_1)
	fmov.s @r0+,DR20	! load 4294967296.0 from the pool, half by half
	fmov.s @r0,DR21
	fadd dr2,dr0		! dr0 += 2^32: unsigned value of the dividend
LOCAL(dividend_adjusted):
	lds r5,fpul
	float fpul,dr2		! dr2 = divisor (bit 31 is clear here)
	fdiv dr2,dr0		! dr0 = dividend / divisor
LOCAL(div_by_1):
	fmov.s @r15+,DR20
	fmov.s @r15+,DR21
	ftrc dr0,fpul		! fpul = dr0 converted back to an integer
	fmov.s @r15+,DR00
	fmov.s @r15+,DR01
#endif /* !FMOVD_WORKS */
	lds.l @r15+,fpscr	! restore fpscr of the caller
	sts fpul,r0		! r0 = quotient
	rts
	 lds.l @r15+,fpul	! delay slot: restore fpul of the caller

	/* With the 8-byte alignment requested here, the five two-byte
	   instructions of huge_divisor end at offset 10, .p2align 2 moves
	   L1 to offset 12, and the double that follows the 4-byte word
	   lands on offset 16.  */
#ifdef FMOVD_WORKS
	.p2align 3        ! make double below 8 byte aligned.
#endif
LOCAL(huge_divisor):
	lds r1,fpscr		! restore fpscr of the caller
	add #4,r15		! drop the saved fpul; fpul was never written
	cmp/hs r5,r4		! T = (dividend >= divisor), unsigned
	rts
	 movt r0		! delay slot: quotient = T, i.e. 0 or 1

	/* Constant pool: the fpscr value used during the computation,
	   followed by 2^32 as a double.  The two fpscr values differ only
	   in bit 20.
	   NOTE(review): per the SH-4 manual bit 19 is FPSCR.PR (double
	   precision) and bit 20 is FPSCR.SZ (64-bit fmov); confirm.  */
	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif
	.double 4294967296

	ENDFUNC(GLOBAL(udivsi3_i4i))
|
|
#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
|
|
|
|
#if 0
/* With 36 bytes, the following would probably be the most compact
   implementation, but with 139 cycles on an sh4-200, it is extremely slow.

   This variant is excluded by the preprocessor and is never assembled;
   it is kept for reference only.  It saves r2 and r3 on the stack,
   uses them together with r1 as scratch, and divides with one div1
   step per loop iteration.
   NOTE(review): the bit that sett and the first rotcr place in r2
   looks like a loop-termination marker; this has not been verified
   because the code is dead.  */
GLOBAL(udivsi3_i4i):
	mov.l r2,@-r15		! save r2
	mov #0,r1
	div0u
	mov r1,r2		! r2 = 0
	mov.l r3,@-r15		! save r3
	mov r1,r3		! r3 = 0
	sett
	mov r4,r0		! r0 = dividend
LOCAL(loop):
	rotcr r2
	;
	bt/s LOCAL(end)
	 cmp/gt r2,r3		! delay slot
	rotcl r0
	bra LOCAL(loop)
	 div1 r5,r1		! delay slot: one division step
LOCAL(end):
	rotcl r0
	mov.l @r15+,r3		! restore r3
	rts
	 mov.l @r15+,r2		! delay slot: restore r2
#endif /* 0 */
|
|
|
|
/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
   sh4-200 run times:
   udiv small divisor: 55 cycles
   udiv large divisor: 52 cycles
   sdiv small divisor, positive result: 59 cycles
   sdiv large divisor, positive result: 56 cycles
   sdiv small divisor, negative result: 65 cycles (*)
   sdiv large divisor, negative result: 62 cycles (*)
   (*): r2 is restored in the rts delay slot and has a lingering latency
	of two more cycles.  */

/* udivsi3_i4i, integer variant: unsigned 32-bit division built from
   div1 steps.

   In:	 r4 = dividend, r5 = divisor
   Out:	 r0 = quotient
   r4 and r5 are pushed on entry and popped before returning.
   Clobbers: r1 (holds the return address), the T bit, and pr, which the
   internal bsr calls overwrite; the routine returns with jmp @r1.

   Two paths, chosen by whether the divisor fits in 16 bits:
   - small divisor: the divisor is moved to the upper half of r5 and the
     dividend is processed as two 16-bit halves, 16 div1 steps each.
   - large divisor: only 16 div1 steps are run, each paired with a
     rotcl that shifts a quotient bit into r0.

   sdiv_small_divisor and sdiv_large_divisor are also entered from
   sdivsi3_i4i, which arrives with r4/r5 already pushed, r0/r4/r5 set up
   the same way, and the continuation address in r1.

   Instructions indented by one extra space sit in a branch delay slot.  */
	.balign 4
	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(udivsi3_i4i):
	sts pr,r1		! r1 = return address
	mov.l r4,@-r15		! save dividend
	extu.w r5,r0
	cmp/eq r5,r0		! T = (divisor fits in 16 bits)
	swap.w r4,r0		! r0 = dividend with its halves exchanged
	shlr16 r4		! r4 = upper half of the dividend
	bf/s LOCAL(large_divisor)
	 div0u			! delay slot: start an unsigned division
	mov.l r5,@-r15		! save divisor
	shll16 r5		! divisor into the upper half for div1
LOCAL(sdiv_small_divisor):
	/* First 16 steps: 1 + 1 + 6 (div6), twice.  */
	div1 r5,r4
	bsr LOCAL(div6)
	 div1 r5,r4
	div1 r5,r4
	bsr LOCAL(div6)
	 div1 r5,r4
	/* Park the bits produced so far in the upper half of r0 and
	   rebuild r4 as partial remainder : lower dividend half.  */
	xtrct r4,r0
	xtrct r0,r4
	/* Second 16 steps: 7 (div7), 1 + 1 + 7 (div7).  */
	bsr LOCAL(div7)
	 swap.w r4,r4
	div1 r5,r4
	bsr LOCAL(div7)
	 div1 r5,r4
	xtrct r4,r0		! merge both 16-bit groups of quotient bits
	mov.l @r15+,r5		! restore divisor
	swap.w r0,r0		! put the two groups in the right order
	mov.l @r15+,r4		! restore dividend
	jmp @r1
	 rotcl r0		! delay slot: shift in the last quotient bit

/* div7 / div6: run 7 or 6 consecutive div1 steps and return; the last
   step sits in the delay slot of the rts.  */
LOCAL(div7):
	div1 r5,r4
LOCAL(div6):
	div1 r5,r4; div1 r5,r4; div1 r5,r4
	div1 r5,r4; div1 r5,r4; rts; div1 r5,r4

/* divx3: three rotcl/div1 pairs; the last div1 sits in the delay slot
   of the rts.  Together with the pair at the call site this gives four
   steps per call.  */
LOCAL(divx3):
	rotcl r0
	div1 r5,r4
	rotcl r0
	div1 r5,r4
	rotcl r0
	rts
	 div1 r5,r4

LOCAL(large_divisor):
	mov.l r5,@-r15		! save divisor
LOCAL(sdiv_large_divisor):
	xor r4,r0		! clear the lower half of r0; the upper half
				! keeps the lower 16 bits of the dividend
	.rept 4			! 4 x (1 + 3) = 16 division steps
	rotcl r0
	bsr LOCAL(divx3)
	 div1 r5,r4
	.endr
	mov.l @r15+,r5		! restore divisor
	mov.l @r15+,r4		! restore dividend
	jmp @r1
	 rotcl r0		! delay slot: shift in the last quotient bit
	ENDFUNC(GLOBAL(udivsi3_i4i))
|
|
|
|
/* sdivsi3_i4i, integer variant: signed 32-bit division.

   In:	 r4 = dividend, r5 = divisor
   Out:	 r0 = quotient
   r4 and r5 are pushed on entry; the shared division code pops them
   again before it leaves.
   Clobbers: r1, the T bit, pr (overwritten by bsr in the shared code)
   and, on the negative-result path only, macl.  r2 is used on that
   path but is parked in macl and restored before returning.

   Both operands are made non-negative, then control joins the unsigned
   code at sdiv_small_divisor or sdiv_large_divisor.  That code finishes
   with jmp @r1, so r1 selects what happens next:
   - result positive: r1 = caller return address.
   - result negative: r1 = negate_result, which negates r0 and returns
     through r2.

   Instructions indented by one extra space sit in a branch delay slot.  */
	.global	GLOBAL(sdivsi3_i4i)
GLOBAL(sdivsi3_i4i):
	mov.l r4,@-r15		! save dividend
	cmp/pz r5		! T = (divisor >= 0)
	mov.l r5,@-r15		! save divisor
	bt/s LOCAL(pos_divisor)
	 cmp/pz r4		! delay slot: T = (dividend >= 0)
	/* Divisor is negative.  */
	neg r5,r5		! r5 = -divisor
	extu.w r5,r0
	bt/s LOCAL(neg_result)	! dividend >= 0, divisor < 0
	 cmp/eq r5,r0		! delay slot: T = (-divisor fits in 16 bits)
	neg r4,r4		! both negative: r4 = -dividend
LOCAL(pos_result):
	swap.w r4,r0		! r0 = dividend with its halves exchanged
	bra LOCAL(sdiv_check_divisor)
	 sts pr,r1		! delay slot: r1 = return address
LOCAL(pos_divisor):
	extu.w r5,r0
	bt/s LOCAL(pos_result)	! dividend >= 0, divisor >= 0
	 cmp/eq r5,r0		! delay slot: T = (divisor fits in 16 bits)
	neg r4,r4		! dividend < 0: r4 = -dividend
LOCAL(neg_result):
	mova LOCAL(negate_result),r0
	;
	mov r0,r1		! r1 = address of negate_result
	swap.w r4,r0		! r0 = dividend with its halves exchanged
	lds r2,macl		! park r2 in macl
	sts pr,r2		! r2 = return address
LOCAL(sdiv_check_divisor):
	/* T still holds the result of the cmp/eq above; none of the
	   instructions executed in between writes T.  */
	shlr16 r4		! r4 = upper half of the dividend
	bf/s LOCAL(sdiv_large_divisor)
	 div0u			! delay slot: start an unsigned division
	bra LOCAL(sdiv_small_divisor)
	 shll16 r5		! delay slot: divisor into the upper half
	.balign 4		! negate_result is the target of the mova above
LOCAL(negate_result):
	neg r0,r0		! quotient = -quotient
	jmp @r2
	 sts macl,r2		! delay slot: restore r2
	ENDFUNC(GLOBAL(sdivsi3_i4i))
|
|
#endif /* !__SH_FPU_DOUBLE__ */
|
|
#endif /* L_udivsi3_i4i */
|
|
|
|
#ifdef L_sdivsi3_i4i
|
|
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
|
|
/* 48 bytes, 45 cycles on sh4-200  */
!! args in r4 and r5, result in r0, clobber r1

/* sdivsi3_i4i, FPU variant: signed 32-bit division carried out with
   double-precision floating point arithmetic.

   In:	 r4 = dividend, r5 = divisor (both signed)
   Out:	 r0 = quotient, converted back to an integer with ftrc
   Clobbers: r1 (holds fpul of the caller during the computation).
   Saved and restored around the computation: fpscr, fpul, dr0, dr2.

   The instruction indented by one extra space sits in the delay slot
   of the rts.  */

	.global GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
	sts.l fpscr,@-r15	! push fpscr of the caller
	sts fpul,r1		! r1 = fpul of the caller
	mova L1,r0		! r0 -> constant pool at L1
	lds.l @r0+,fpscr	! fpscr = pool word
	lds r4,fpul
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15	! save dr0
	float fpul,dr0		! dr0 = dividend
	lds r5,fpul
	fmov.d dr2,@-r15	! save dr2
#else
	/* Same steps, but every double register is moved as two
	   single-precision halves named by the DRxx macros.  */
	fmov.s DR01,@-r15
	fmov.s DR00,@-r15
	float fpul,dr0		! dr0 = dividend
	lds r5,fpul
	fmov.s DR21,@-r15
	fmov.s DR20,@-r15
#endif
	float fpul,dr2		! dr2 = divisor
	fdiv dr2,dr0		! dr0 = dividend / divisor
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr2	! restore dr2
#else
	fmov.s @r15+,DR20
	fmov.s @r15+,DR21
#endif
	ftrc dr0,fpul		! fpul = dr0 converted back to an integer
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr0	! restore dr0
#else
	fmov.s @r15+,DR00
	fmov.s @r15+,DR01
#endif
	lds.l @r15+,fpscr	! restore fpscr of the caller
	sts fpul,r0		! r0 = quotient
	rts
	 lds r1,fpul		! delay slot: restore fpul of the caller

	/* Constant pool: the fpscr value used during the computation.  The
	   two values differ only in bit 20.
	   NOTE(review): per the SH-4 manual bit 19 is FPSCR.PR (double
	   precision) and bit 20 is FPSCR.SZ (64-bit fmov); confirm.  */
	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif

	ENDFUNC(GLOBAL(sdivsi3_i4i))
|
|
#endif /* __SH_FPU_DOUBLE__ */
|
|
#endif /* L_sdivsi3_i4i */
|