/* libgcc functions for Blackfin.
   Copyright (C) 2005, 2009 Free Software Foundation, Inc.
   Contributed by Analog Devices.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
|
|
|
|
#ifdef L_divsi3
	.text
	.align 2
	.global ___divsi3;
	.type ___divsi3, STT_FUNC;

/* Signed 32-bit division.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = quotient.

   Both operands are replaced by their magnitudes, the unsigned divide
   ___udivsi3 is called, and the quotient is negated when exactly one
   of the operands was negative.

   RETS and R7 are saved on the stack and restored before returning.
   R1 and R2 are overwritten here, in addition to whatever ___udivsi3
   itself overwrites.  */

___divsi3:
	[--SP] = RETS;		/* The CALL below overwrites RETS.  */
	[--SP] = R7;		/* R7 holds the result sign across the call.  */

	/* R0 = |dividend|; R7 = 1 if the dividend was negative.  */
	R2 = -R0;
	CC = R0 < 0;
	IF CC R0 = R2;
	R7 = CC;

	/* R1 = |divisor|; R7 ^= 1 if the divisor was negative, so R7 ends
	   up as 1 only when the operand signs differ.  */
	R2 = -R1;
	CC = R1 < 0;
	IF CC R1 = R2;
	R2 = CC;
	R7 = R7 ^ R2;

	CALL ___udivsi3;

	/* Apply the sign recorded in R7 to the unsigned quotient.  */
	CC = R7;
	R1 = -R0;
	IF CC R0 = R1;

	R7 = [SP++];
	RETS = [SP++];
	RTS;

	.size ___divsi3, .-___divsi3
#endif
|
|
|
|
#ifdef L_modsi3
	.align 2
	.global ___modsi3;
	.type ___modsi3, STT_FUNC;

/* Signed 32-bit remainder.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = dividend - divisor * ___divsi3 (dividend, divisor).

   The operands are kept on the stack across the call to ___divsi3 and
   popped afterwards; RETS is saved and restored.  R1 and R2 are
   overwritten here, in addition to whatever ___divsi3 overwrites.  */

___modsi3:
	[--SP] = RETS;		/* The CALL below overwrites RETS.  */
	[--SP] = R0;		/* Keep the dividend ...  */
	[--SP] = R1;		/* ... and the divisor across the call.  */
	CALL ___divsi3;		/* R0 = quotient.  */
	R2 = [SP++];		/* R2 = divisor (pushed last, popped first).  */
	R1 = [SP++];		/* R1 = dividend.  */
	R2 *= R0;		/* R2 = divisor * quotient.  */
	R0 = R1 - R2;		/* R0 = dividend - divisor * quotient.  */
	RETS = [SP++];
	RTS;

	.size ___modsi3, .-___modsi3
#endif
|
|
|
|
#ifdef L_udivsi3
	.align 2
	.global ___udivsi3;
	.type ___udivsi3, STT_FUNC;

/* Unsigned 32-bit division, one quotient bit per loop iteration.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = quotient, R3 = remainder.  ___umodsi3 depends on the
	 remainder being left in R3.

   Overwrites P0, R2, R3, CC and the hardware loop 0 counter LC0.  No
   stack is used.  The divisor is not checked against zero.

   R3:R0 is treated as a 64-bit shift register that is rotated left
   through CC.  After each shift, CC is set when R3 < R1, i.e. when the
   divisor does NOT fit, so the bit rotated into R0 on the following
   shift is the complement of the quotient bit.  R0 is therefore
   inverted once at the end.  */

___udivsi3:
	P0 = 32;
	LSETUP (0f, 1f) LC0 = P0;
	/* upper half of dividend */
	R3 = 0;
0:
	/* The first time round in the loop we shift in garbage, but since we
	   perform 33 shifts, it doesn't matter.  */
	R0 = ROT R0 BY 1;	/* CC -> R0 bit 0; old R0 bit 31 -> CC.  */
	R3 = ROT R3 BY 1;	/* Bring that dividend bit into R3.  */
	R2 = R3 - R1;		/* Trial subtraction.  */
	CC = R3 < R1 (IU);	/* CC = 1: divisor does not fit.  */
1:
	/* Last instruction of the loop.  */
	IF ! CC R3 = R2;	/* Keep the difference when it fits.  */

	/* Shift in the last bit.  */
	R0 = ROT R0 BY 1;
	/* R0 is the result, R3 contains the remainder.  */
	R0 = ~ R0;
	RTS;

	.size ___udivsi3, .-___udivsi3
#endif
|
|
|
|
#ifdef L_umodsi3
	.align 2
	.global ___umodsi3;
	.type ___umodsi3, STT_FUNC;

/* Unsigned 32-bit remainder.

   In:   R0 = dividend, R1 = divisor.
   Out:  R0 = remainder.

   This is a thin wrapper: it calls ___udivsi3 and returns the value
   that routine leaves in R3.  RETS is saved and restored around the
   call; everything ___udivsi3 overwrites is overwritten here too.  */

___umodsi3:
	[--SP] = RETS;		/* The CALL below overwrites RETS.  */
	CALL ___udivsi3;
	R0 = R3;		/* ___udivsi3 leaves the remainder in R3.  */
	RETS = [SP++];
	RTS;

	.size ___umodsi3, .-___umodsi3
#endif
|
|
|
|
#ifdef L_umulsi3_highpart
	.align 2
	.global ___umulsi3_highpart;
	.type ___umulsi3_highpart, STT_FUNC;

/* Upper 32 bits of the unsigned 64-bit product R0 * R1.

   In:   R0, R1 = unsigned 32-bit factors.
   Out:  R0 = bits 63..32 of the product.

   Built from four 16x16 unsigned multiplies:

     product = (R0.H * R1.H) * 2^32
	     + (R1.L * R0.H + R0.L * R1.H) * 2^16
	     + (R1.L * R0.L)

   A1 collects the low and middle terms, shifted down 16 bits at each
   step so that only the carries into the next level survive; A0
   collects the high term plus the final carry.

   Overwrites the accumulators A0 and A1.  No stack is used.  */

___umulsi3_highpart:
	A1 = R1.L * R0.L (FU);		/* low * low */
	A1 = A1 >> 16;			/* keep its carry into the 2^16 level */
	A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU); /* high * high; cross term */
	A1 += R0.L * R1.H (FU);		/* other cross term */
	A1 = A1 >> 16;			/* carry into the 2^32 level */
	A0 += A1;
	R0 = A0 (FU);
	RTS;

	.size ___umulsi3_highpart, .-___umulsi3_highpart
#endif
|
|
|
|
#ifdef L_smulsi3_highpart
	.align 2
	.global ___smulsi3_highpart;
	.type ___smulsi3_highpart, STT_FUNC;

/* Upper 32 bits of the signed 64-bit product R0 * R1.

   In:   R0, R1 = signed 32-bit factors.
   Out:  R0 = bits 63..32 of the product.

   Same decomposition into 16x16 partial products as the unsigned
   variant, except that the low halves are treated as unsigned and the
   high halves as signed: low * low uses FU, the cross terms use the
   mixed mode (IS,M), and the second shift is arithmetic (>>>) so the
   sign of the cross-term sum is preserved.

   Overwrites the accumulators A0 and A1.  No stack is used.  */

___smulsi3_highpart:
	A1 = R1.L * R0.L (FU);		/* low * low, unsigned */
	A1 = A1 >> 16;			/* keep its carry into the 2^16 level */
	A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M); /* high * high; cross term */
	A1 += R1.H * R0.L (IS,M);	/* other cross term */
	A1 = A1 >>> 16;			/* signed carry into the 2^32 level */
	R0 = (A0 += A1);
	RTS;

	.size ___smulsi3_highpart, .-___smulsi3_highpart
#endif
|
|
|
|
#ifdef L_muldi3
	.align 2
	.global ___muldi3;
	.type ___muldi3, STT_FUNC;

/* Low 64 bits of the 64x64-bit product R1:R0 * R3:R2.  R3 is not live
   on entry; the code loads it from [SP + 12] in its first instruction.

   Split both operands into 16-bit halves,

	R1:R0 * R3:R2
      = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l

   and group the partial products by weight:

   [X]    (R1.h * R3.h) * 2^96
   [X]  + (R1.h * R3.l + R1.l * R3.h) * 2^80
   [X]  + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
   [T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
   [T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
   [T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
   [T4] + (R0.l * R2.l)

   The three groups marked "X" lie entirely above bit 63 and are
   dropped because only a 64-bit result is produced.  That leaves ten
   16-bit multiplies.

   Individual mul-acc results (T[n] is E[n] times its power of two):
   [E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
   [E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
   [E3] = R0.l * R2.h + R2.l * R0.h
   [E4] = R0.l * R2.l

   High parts of lower-level results must be carried into the next
   level up:
	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4

   A useful property is that every part of the result that depends on
   the sign of the multiplication is discarded.  Those parts come from
   the multiplications involving R1.h and R3.h, but only the top 16
   bits of each 32-bit product depend on the sign, and since R1.h and
   R3.h occur only in E1, the top half of those products is cut off.
   So FU mode can be used for all of the 16-bit multiplies, and the
   question of when to use mixed mode never arises.  */

___muldi3:
	/* The slot at [SP] technically belongs to the caller's frame, but
	   it can be used as scratch space; R4 is parked there and reloaded
	   just before returning.  */
	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;	/* E1 */
	A0 += A1;				/* E1 */
	R4 = A0.w;				/* R4 = E1 */

	A0 = R0.L * R3.L (FU);			/* E2 */
	A0 += R2.L * R1.L (FU);			/* E2 */

	A1 = R2.L * R0.L (FU);			/* E4 */
	R3 = A1.w;				/* R3 = E4; old R3 no longer needed */
	A1 = A1 >> 16;				/* E3c */
	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);	/* E2, E3c */
	A1 += R0.L * R2.H (FU);			/* E3c */
	R0 = A1.w;				/* R0 = E3c; old R0 no longer needed */
	A1 = A1 >> 16;				/* E2c */
	A0 += A1;				/* E2c */

	/* low(result) = low(E3c):low(E4) */
	R0 = PACK (R0.L, R3.L);

	/* high(result) = E2c + (E1 << 16) */
	R1 = A0.w;				/* R1 = E2c */
	R1.H = R1.H + R4.L (NS) || R4 = [SP];
	RTS;

	.size ___muldi3, .-___muldi3
#endif
|