gcc/libgcc/config/h8300/lib1funcs.S

839 lines
14 KiB
ArmAsm

;; libgcc routines for the Renesas H8/300 CPU.
;; Contributed by Steve Chamberlain <sac@cygnus.com>
;; Optimizations by Toshiyasu Morita <toshiyasu.morita@renesas.com>
/* Copyright (C) 1994, 2000, 2001, 2002, 2003, 2004, 2009
Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.
This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* Assembler register definitions.

   Most routines below refer to registers through these aliases:
     A0..A3 name r0..r3 and are used freely without being saved;
     S0..S2 name r4..r6 and are pushed before the routines here modify them.
   An L or H suffix names the low or high byte of the 16-bit register.  */
#define A0 r0
#define A0L r0l
#define A0H r0h
#define A1 r1
#define A1L r1l
#define A1H r1h
#define A2 r2
#define A2L r2l
#define A2H r2h
#define A3 r3
#define A3L r3l
#define A3H r3h
#define S0 r4
#define S0L r4l
#define S0H r4h
#define S1 r5
#define S1L r5l
#define S1H r5h
#define S2 r6
#define S2L r6l
#define S2H r6h
/* Full-width register names (P suffix) and the matching push/pop
   mnemonics.  On the plain H8/300 these are the 16-bit rN registers.  */
#ifdef __H8300__
#define PUSHP push
#define POPP pop
#define A0P r0
#define A1P r1
#define A2P r2
#define A3P r3
#define S0P r4
#define S1P r5
#define S2P r6
#endif
/* On the H8/300H, H8S and H8SX the P names are the 32-bit erN registers,
   saved and restored with push.l/pop.l.  The E suffix names eN, the upper
   16 bits of the corresponding erN.  */
#if defined (__H8300H__) || defined (__H8300S__) || defined (__H8300SX__)
#define PUSHP push.l
#define POPP pop.l
#define A0P er0
#define A1P er1
#define A2P er2
#define A3P er3
#define S0P er4
#define S1P er5
#define S2P er6
#define A0E e0
#define A1E e1
#define A2E e2
#define A3E e3
#endif
/* Tell the assembler which CPU variant this file is being built for.
   When __NORMAL_MODE__ is defined the normal-mode form of the directive
   (the one with the trailing n) is used instead.  No directive is emitted
   for the plain H8/300.  */
#ifdef __H8300H__
#ifdef __NORMAL_MODE__
.h8300hn
#else
.h8300h
#endif
#endif
#ifdef __H8300S__
#ifdef __NORMAL_MODE__
.h8300sn
#else
.h8300s
#endif
#endif
#ifdef __H8300SX__
#ifdef __NORMAL_MODE__
.h8300sxn
#else
.h8300sx
#endif
#endif
#ifdef L_cmpsi2
#ifdef __H8300__
;; ___cmpsi2: signed 32-bit three-way compare (plain H8/300 only).
;;   in:  a in A0 (high word) / A1 (low word), b in A2 / A3
;;   out: A0 = 0 if a < b, 1 if a == b, 2 if a > b
	.section .text
	.align 2
	.global	___cmpsi2
___cmpsi2:
	cmp.w	A0,A2		; high words first: flags from b.hi - a.hi
	beq	.Lcmp_low	; equal, so the low words decide
	bgt	.Lcmp_less	; signed b.hi > a.hi, so a < b
.Lcmp_greater:
	mov.w	#2,A0		; a > b
	rts
.Lcmp_low:
	cmp.w	A1,A3		; low words are compared as unsigned
	beq	.Lcmp_equal
	bls	.Lcmp_greater	; b.lo is below a.lo, so a > b
.Lcmp_less:
	sub.w	A0,A0		; a < b
	rts
.Lcmp_equal:
	mov.w	#1,A0		; a == b
	rts
	.end
#endif
#endif /* L_cmpsi2 */
#ifdef L_ucmpsi2
#ifdef __H8300__
;; ___ucmpsi2: unsigned 32-bit three-way compare (plain H8/300 only).
;;   in:  a in A0 (high word) / A1 (low word), b in A2 / A3
;;   out: A0 = 0 if a < b, 1 if a == b, 2 if a > b
	.section .text
	.align 2
	.global	___ucmpsi2
___ucmpsi2:
	cmp.w	A0,A2		; high words first: flags from b.hi - a.hi
	beq	.Lucmp_low	; equal, so the low words decide
	bhi	.Lucmp_less	; unsigned b.hi > a.hi, so a < b
.Lucmp_greater:
	mov.w	#2,A0		; a > b
	rts
.Lucmp_low:
	cmp.w	A1,A3		; flags from b.lo - a.lo
	beq	.Lucmp_equal
	bls	.Lucmp_greater	; b.lo is below a.lo, so a > b
.Lucmp_less:
	sub.w	A0,A0		; a < b
	rts
.Lucmp_equal:
	mov.w	#1,A0		; a == b
	rts
	.end
#endif
#endif /* L_ucmpsi2 */
#ifdef L_divhi3
;; HImode divides for the H8/300.
;; We bunch all of this into one object file since there are several
;; "supporting routines".
; general purpose normalize routine
;
; dividend in A0
; divisor in A1
; turns both into +ve numbers, and leaves what the answer sign
; should be in bit 3 of A2L
#ifdef __H8300__
.section .text
.align 2
;; divnorm: make both HImode operands non-negative for ___divhi3.
;;   in:  A0 = dividend, A1 = divisor
;;   out: A0, A1 = absolute values
;;        bit 3 of A2L = 1 when exactly one operand was negative,
;;        which means the quotient has to be negated afterwards
divnorm:
	mov.b	A0H,A0H		; N flag <- sign of the dividend
	stc	ccr,A2L		; save CCR; bit 3 of the copy is the N flag
	bge	.Ldn_divisor
	not	A0H		; dividend = -dividend (complement, add 1)
	not	A0L
	adds	#1,A0
.Ldn_divisor:
	mov.b	A1H,A1H		; N flag <- sign of the divisor
	bge	.Ldn_done
	not	A1H		; divisor = -divisor
	not	A1L
	adds	#1,A1
	xor	#0x8,A2L	; a negative divisor flips the result sign
.Ldn_done:
	rts
;; modnorm: like divnorm, but for the remainder.  Only the sign of the
;; dividend (A0) is recorded, so the remainder takes the sign of the
;; dividend; a negative divisor is made positive without touching the
;; saved sign.
;;   in:  A0 = dividend, A1 = divisor
;;   out: A0, A1 = absolute values; bit 3 of A2L = sign of the dividend
modnorm:
	mov.b	A0H,A0H		; N flag <- sign of the dividend
	stc	ccr,A2L		; save CCR; bit 3 of the copy is the N flag
	bge	.Lmn_divisor
	not	A0H		; dividend = -dividend (complement, add 1)
	not	A0L
	adds	#1,A0
.Lmn_divisor:
	mov.b	A1H,A1H		; N flag <- sign of the divisor
	bge	.Lmn_done
	not	A1H		; divisor = -divisor
	not	A1L
	adds	#1,A1
.Lmn_done:
	rts
;; ___divhi3: A0 = A0 / A1, signed.
;; The operands are made non-negative, divided unsigned, and the quotient
;; is negated when the sign bit saved in A2L says so.
	.global	___divhi3
___divhi3:
	bsr	divnorm		; A0, A1 = magnitudes, sign in bit 3 of A2L
	bsr	___udivhi3	; A0 = quotient, A3 = remainder
;; negans: shared tail, also entered from ___modhi3.  Negates A0 when
;; bit 3 of A2L is set, then returns to the original caller.
negans:
	btst	#3,A2L		; should the answer be negative?
	beq	.Lna_done
	not	A0H		; yes: A0 = -A0
	not	A0L
	adds	#1,A0
.Lna_done:
	rts
;; ___modhi3: A0 = A0 % A1, signed.
	.global	___modhi3
___modhi3:
	bsr	modnorm		; magnitudes in A0/A1, dividend sign in A2L
	bsr	___udivhi3	; remainder comes back in A3
	mov.w	A3,A0		; return the remainder instead of the quotient
	bra	negans		; apply the saved sign and return
;; ___umodhi3: A0 = A0 % A1, unsigned.
	.global	___umodhi3
___umodhi3:
	bsr	___udivhi3	; remainder comes back in A3
	mov.w	A3,A0		; return it in A0
	rts
;; ___udivhi3: unsigned 16-bit divide.
;;   in:  A0 = numerator   (N = high byte, n = low byte)
;;        A1 = denominator (D = high byte, d = low byte)
;;   out: A0 = quotient    (Q:q)
;;        A3 = remainder   (M:m)
;;   A2H is used as a loop counter and trashed; A2L is left alone.
;;
;; The H8/300 only has a 16/8 bit divide, so the work is split up
;; according to which bytes of the operands are non-zero.
	.global	___udivhi3
___udivhi3:
	sub.w	A3,A3		; A3 = 0:0
	mov.b	A1H,A1H
	bne	.Lud_wide	; D != 0: divxu cannot be used
	mov.b	A0H,A0H
	beq	.Lud_low	; N == 0: only the low byte needs dividing
	; D == 0 and N != 0: divide the high byte first
	mov.b	A0H,A3L		; A3 = 0:N
	divxu	A1L,A3		; A3 = M:Q (remainder:quotient of N / d)
	mov.b	A3L,A0H		; high byte of the quotient
.Lud_low:
	mov.b	A0L,A3L		; A3 = M:n (M is 0 when N was 0)
	divxu	A1L,A3		; A3 = m:q
	mov.b	A3L,A0L		; A0 = Q:q
	mov.b	A3H,A3L		; move the remainder to the low byte
	mov.b	#0x0,A3H	; A3 = 0:m
	rts

	; D != 0, so the denominator is at least 256 and the quotient fits
	; in 8 bits.  Start with N as the remainder and shift the 8 bits of
	; n in one at a time, subtracting the denominator whenever it fits.
.Lud_wide:
	mov.b	A0H,A3L		; remainder = 0:N
	mov.b	#0x0,A0H	; high byte of the quotient has to be zero
	mov.b	#0x8,A2H	; 8 bits to go
.Lud_bit:
	add.b	A0L,A0L		; shift n left; its top bit goes to carry
	rotxl	A3L		; ... and from carry into the remainder
	rotxl	A3H
	sub.w	A1,A3		; remainder -= denominator
	blo	.Lud_undo	; borrow: it did not fit
	inc	A0L		; it fit: record a 1 quotient bit
	dec	A2H
	bne	.Lud_bit
	rts
.Lud_undo:
	add.w	A1,A3		; went too far, put the denominator back
	dec	A2H
	bne	.Lud_bit
	rts
#endif /* __H8300__ */
#endif /* L_divhi3 */
#ifdef L_divsi3
;; 4 byte integer divides for the H8/300.
;;
;; We have one routine which does all the work and lots of
;; little ones which prepare the args and massage the sign.
;; We bunch all of this into one object file since there are several
;; "supporting routines".
.section .text
.align 2
; Put abs SIs into r0/r1 and r2/r3 (er0 and er1 on the H8/300H), and leave
; the sign of the result in bit 3 of r6l.
; This function is here to keep branch displacements small.
#ifdef __H8300__
;; divnorm (H8/300): make both SImode operands non-negative for ___divsi3.
;;   in:  numerator in A0/A1, denominator in A2/A3 (high word first)
;;   out: both replaced by their absolute values
;;        bit 3 of S2L = 1 when exactly one operand was negative
divnorm:
	mov.b	A0H,A0H		; N flag <- sign of the numerator
	stc	ccr,S2L		; keep the sign in bit 3 of S2L
	bge	.Ldn_den
	; numerator = -numerator: complement all four bytes, then add 1
	not	A0H
	not	A0L
	not	A1H
	not	A1L
	add.b	#1,A1L
	addx	#0,A1H		; ripple the carry up through the bytes
	addx	#0,A0L
	addx	#0,A0H
.Ldn_den:
	mov.b	A2H,A2H		; N flag <- sign of the denominator
	bge	.Ldn_done
	; denominator = -denominator
	not	A2H
	not	A2L
	not	A3H
	not	A3L
	add.b	#1,A3L
	addx	#0,A3H
	addx	#0,A2L
	addx	#0,A2H
	xor.b	#0x08,S2L	; toggle the result sign
.Ldn_done:
	rts
;; modnorm (H8/300): like divnorm, but for the remainder.  Only the sign
;; of the numerator is recorded in S2L, so the remainder takes the sign
;; of the numerator; a negative denominator is made positive without
;; touching the saved sign.
;;   in:  numerator in A0/A1, denominator in A2/A3 (high word first)
;;   out: both replaced by their absolute values
;;        bit 3 of S2L = sign of the numerator
modnorm:
	mov.b	A0H,A0H		; N flag <- sign of the numerator
	stc	ccr,S2L		; keep the sign in bit 3 of S2L
	bge	.Lmn_den
	; numerator = -numerator: complement all four bytes, then add 1
	not	A0H
	not	A0L
	not	A1H
	not	A1L
	add.b	#1,A1L
	addx	#0,A1H		; ripple the carry up through the bytes
	addx	#0,A0L
	addx	#0,A0H
.Lmn_den:
	mov.b	A2H,A2H		; N flag <- sign of the denominator
	bge	.Lmn_done
	; denominator = -denominator
	not	A2H
	not	A2L
	not	A3H
	not	A3L
	add.b	#1,A3L
	addx	#0,A3H
	addx	#0,A2L
	addx	#0,A2H
.Lmn_done:
	rts
#else /* __H8300H__ */
;; divnorm (H8/300H and later): make both SImode operands non-negative.
;;   in:  numerator in A0P, denominator in A1P
;;   out: both replaced by their absolute values
;;        bit 3 of S2L = 1 when exactly one operand was negative
divnorm:
	mov.l	A0P,A0P		; N flag <- sign of the numerator
	stc	ccr,S2L		; keep the sign in bit 3 of S2L
	bge	.Ldn_den
	neg.l	A0P		; numerator = -numerator
.Ldn_den:
	mov.l	A1P,A1P		; N flag <- sign of the denominator
	bge	.Ldn_done
	neg.l	A1P		; denominator = -denominator
	xor.b	#0x08,S2L	; toggle the result sign
.Ldn_done:
	rts
;; modnorm (H8/300H and later): like divnorm, but for the remainder.
;; Only the sign of the numerator is recorded in S2L, so the remainder
;; takes the sign of the numerator.
;;   in:  numerator in A0P, denominator in A1P
;;   out: both replaced by their absolute values
;;        bit 3 of S2L = sign of the numerator
modnorm:
	mov.l	A0P,A0P		; N flag <- sign of the numerator
	stc	ccr,S2L		; keep the sign in bit 3 of S2L
	bge	.Lmn_den
	neg.l	A0P		; numerator = -numerator
.Lmn_den:
	mov.l	A1P,A1P		; N flag <- sign of the denominator
	bge	.Lmn_done
	neg.l	A1P		; denominator = -denominator
.Lmn_done:
	rts
#endif
;; ___modsi3: signed 32-bit remainder.
;;   H8/300:  numerator in A0/A1, denominator in A2/A3, result in A0/A1
;;   others:  numerator in er0, denominator in er1, result in er0
;; Finishes through exitdiv, which applies the sign kept in S2L and pops
;; the registers pushed here.
	.global	___modsi3
___modsi3:
#ifdef __H8300__
	PUSHP	S2P		; S2L holds the result sign
	PUSHP	S0P		; S0/S1 receive the remainder
	PUSHP	S1P
	bsr	modnorm
	bsr	divmodsi4
	mov.w	S0,A0		; return the remainder, not the quotient
	mov.w	S1,A1
	bra	exitdiv
#else
	PUSHP	S2P		; S2L holds the result sign
	bsr	modnorm
	bsr	___udivsi3	; leaves the remainder in er3
	mov.l	A3P,A0P
	bra	exitdiv
#endif
;; ___udivsi3 for the plain H8/300: unsigned 32-bit divide,
;; A0/A1 = A0/A1 / A2/A3.  The H8/300H and H8S version of ___udivsi3 is
;; defined later in the file.
;; S0, S1 and S2 are saved here because divmodsi4 writes to them; the
;; shared epilogue reti restores them and returns.
#ifdef __H8300__
	.global	___udivsi3
___udivsi3:
	PUSHP	S2P
	PUSHP	S0P
	PUSHP	S1P
	bsr	divmodsi4	; quotient is left in A0/A1
	bra	reti
#endif
;; ___umodsi3: unsigned 32-bit remainder.
;;   H8/300:  A0/A1 = A0/A1 % A2/A3
;;   others:  er0 = er0 % er1
	.global	___umodsi3
___umodsi3:
#ifdef __H8300__
	PUSHP	S2P		; divmodsi4 writes to S0, S1 and S2H
	PUSHP	S0P
	PUSHP	S1P
	bsr	divmodsi4
	mov.w	S0,A0		; return the remainder, not the quotient
	mov.w	S1,A1
	bra	reti		; pop the saved registers and return
#else
	bsr	___udivsi3	; leaves the remainder in er3
	mov.l	A3P,A0P
	rts
#endif
;; ___divsi3: signed 32-bit divide.
;;   H8/300:  A0/A1 = A0/A1 / A2/A3
;;   others:  er0 = er0 / er1
	.global	___divsi3
___divsi3:
#ifdef __H8300__
	PUSHP	S2P		; S2L holds the result sign
	PUSHP	S0P		; divmodsi4 writes to S0 and S1
	PUSHP	S1P
	jsr	divnorm		; jsr as in the original (presumably for range)
	jsr	divmodsi4
#else
	PUSHP	S2P		; S2L holds the result sign
	jsr	divnorm
	bsr	___udivsi3
#endif
;; exitdiv: shared tail, also entered from ___modsi3.  Negates the result
;; when bit 3 of S2L is set, then falls into reti.
exitdiv:
	btst	#3,S2L		; should the result be negative?
	beq	reti
#ifdef __H8300__
	; yes: complement all four bytes, then add 1
	not	A0H
	not	A0L
	not	A1H
	not	A1L
	add.b	#1,A1L
	addx	#0,A1H
	addx	#0,A0L
	addx	#0,A0H
#else /* __H8300H__ */
	neg.l	A0P
#endif
;; reti: shared epilogue.  Pops what the entry points pushed (S1 and S0
;; only on the plain H8/300, then S2) and returns.
reti:
#ifdef __H8300__
	POPP	S1P
	POPP	S0P
#endif
	POPP	S2P
	rts
; takes A0/A1 numerator (A0P for H8/300H)
; A2/A3 denominator (A1P for H8/300H)
; returns A0/A1 quotient (A0P for H8/300H)
; S0/S1 remainder (er3 for H8/300H)
; trashes S2H (H8/300) or er2 (H8/300H)
#ifdef __H8300__
;; divmodsi4: unsigned 32-bit divide with remainder (plain H8/300).
;;   in:  numerator in A0/A1, denominator in A2/A3 (high word first)
;;   out: quotient in A0/A1, remainder in S0/S1
;;   S2H is used as scratch and as the loop counter.  S2L is not written,
;;   so the sign bit the callers keep there survives the call.
divmodsi4:
sub.w S0,S0 ; zero play area
mov.w S0,S1
mov.b A2H,S2H ; OR the top three denominator bytes together
or A2L,S2H
or A3H,S2H
bne DenHighNonZero ; any bit set: the denominator does not fit in 8 bits
; The denominator fits in A3L, so divxu can be used.  Skip the leading
; zero bytes of the numerator and enter the divide chain at the first
; non-zero byte.  Note that the label names read backwards: NumByteNZero
; is the entry taken when byte N is NOT zero.
mov.b A0H,A0H
bne NumByte0Zero
mov.b A0L,A0L
bne NumByte1Zero
mov.b A1H,A1H
bne NumByte2Zero
bra NumByte3Zero
; Long division one byte at a time.  divxu leaves the quotient byte in
; S1L and the remainder in S1H; the remainder stays in S1H as the high
; half of the next 16/8 divide.
NumByte0Zero:
mov.b A0H,S1L
divxu A3L,S1
mov.b S1L,A0H
NumByte1Zero:
mov.b A0L,S1L
divxu A3L,S1
mov.b S1L,A0L
NumByte2Zero:
mov.b A1H,S1L
divxu A3L,S1
mov.b S1L,A1H
NumByte3Zero:
mov.b A1L,S1L
divxu A3L,S1
mov.b S1L,A1L
mov.b S1H,S1L ; move the final remainder down to the low byte
mov.b #0x0,S1H ; S0/S1 now hold the remainder, zero extended
rts
; have to do the divide by shift and test
; The denominator is at least 256 here, so the top numerator byte on its
; own can never reach it: that byte becomes the initial remainder and only
; the remaining 24 numerator bits are shifted in.
DenHighNonZero:
mov.b A0H,S1L ; remainder = top byte of the numerator
mov.b A0L,A0H ; move the other three numerator bytes up by one byte
mov.b A1H,A0L
mov.b A1L,A1H
mov.b #0,A1L ; quotient bits are collected from here upwards
mov.b #24,S2H ; only do 24 iterations
nextbit:
add.w A1,A1 ; double the answer guess
rotxl A0L ; carry ripples on up through A0 ...
rotxl A0H
rotxl S1L ; double remainder
rotxl S1H
rotxl S0L
rotxl S0H
sub.w A3,S1 ; does it all fit
subx A2L,S0L
subx A2H,S0H
bhs setone ; no borrow: the denominator fitted
add.w A3,S1 ; no, restore mistake
addx A2L,S0L
addx A2H,S0H
dec S2H
bne nextbit
rts
setone:
inc A1L ; record a 1 quotient bit (bit 0 is clear after the shift)
dec S2H
bne nextbit
rts
#else /* __H8300H__ */
;; This function also computes the remainder and stores it in er3.
;;
;; ___udivsi3 (H8/300H, H8S, H8SX): unsigned 32-bit divide.
;;   in:  er0 = dividend, er1 = divisor
;;   out: er0 = quotient, er3 = remainder
;;   er2 is used as scratch.  r6 is not touched, so the sign bit that
;;   ___divsi3 and ___modsi3 keep in S2L survives the call.
.global ___udivsi3
___udivsi3:
mov.w A1E,A1E ; denominator top word 0?
bne DenHighNonZero
; do it the easy way, see page 107 in manual
; Two chained 32/16 divides: the remainder of the high-word divide becomes
; the upper half of the dividend for the low-word divide.
mov.w A0E,A2 ; r2 = high word of the dividend
extu.l A2P ; er2 = that word, zero extended
divxu.w A1,A2P ; r2 = high quotient word, e2 = remainder
mov.w A2E,A0E ; er0 = remainder : low dividend word
divxu.w A1,A0P ; r0 = low quotient word, e0 = final remainder
mov.w A0E,A3 ; r3 = remainder
mov.w A2,A0E ; er0 = high quotient word : low quotient word
extu.l A3P ; er3 = remainder, zero extended
rts
; er0 = er0 / er1
; er3 = er0 % er1
; trashes er2 (er1 is only read)
; expects er1 >= 2^16
;
; Method: shift dividend and divisor right by the same amount until the
; divisor fits in 16 bits, use one divxu.w to get an approximate quotient
; (AQ), then correct it against the real divisor.
DenHighNonZero:
mov.l er0,er3
mov.l er1,er2
#ifdef __H8300H__
; The H8/300H shifts one bit at a time.
divmod_L21:
shlr.l er0
shlr.l er2 ; make divisor < 2^16
mov.w e2,e2
bne divmod_L21
#else
; Here the shifts go two bits at a time.  er2 always runs one 2-bit step
; ahead of er0, so that the last step can be trimmed to a single bit.
shlr.l #2,er2 ; make divisor < 2^16
mov.w e2,e2
beq divmod_L22A
divmod_L21:
shlr.l #2,er0 ; catch up with the previous divisor shift
divmod_L22:
shlr.l #2,er2 ; make divisor < 2^16
mov.w e2,e2
bne divmod_L21
divmod_L22A:
; er2 now fits in r2 and is one 2-bit step ahead of er0.
rotxl.w r2 ; bit 15 goes to carry, the carry left by the last shlr.l enters bit 0
bcs divmod_L23
shlr.l er0 ; bit 15 was clear: r2 stays shifted back up, er0 moves one bit only
bra divmod_L24
divmod_L23:
rotxr.w r2 ; bit 15 was set: undo the rotate ...
shlr.l #2,er0 ; ... and give er0 the full 2-bit step
divmod_L24:
#endif
;; At this point,
;; er0 contains shifted dividend
;; er1 contains divisor
;; er2 contains shifted divisor
;; er3 contains dividend, later remainder
divxu.w r2,er0 ; r0 now contains the approximate quotient (AQ)
extu.l er0 ; drop the divxu remainder from e0
beq divmod_L25 ; AQ is zero: er3 still holds the whole dividend
subs #1,er0 ; er0 = AQ - 1
mov.w e1,r2
mulxu.w r0,er2 ; er2 = upper (AQ - 1) * divisor
sub.w r2,e3 ; dividend - 65536 * er2
mov.w r1,r2
mulxu.w r0,er2 ; compute er3 = remainder (tentative)
sub.l er2,er3 ; er3 = dividend - (AQ - 1) * divisor
divmod_L25:
cmp.l er1,er3 ; is divisor <= remainder?
blo divmod_L26 ; no: quotient and remainder are final
adds #1,er0 ; yes: the quotient was one too small
sub.l er1,er3 ; correct the remainder
divmod_L26:
rts
#endif
#endif /* L_divsi3 */
#ifdef L_mulhi3
;; HImode multiply.
; The H8/300 only has an 8*8->16 multiply.
; The answer is the same as:
;
; product = (srca.l * srcb.l) + ((srca.h * srcb.l) + (srcb.h * srca.l)) * 256
; (we can ignore A1.h * A0.h because that will all fall off the top)
; A0 in
; A1 in
; A0 answer
#ifdef __H8300__
;; ___mulhi3: A0 = low 16 bits of A0 * A1 (plain H8/300 only).
;; Built from three 8x8->16 multiplies.  Only the low byte of each cross
;; product is added, into the high byte of the result; everything above
;; that lies beyond bit 15.  A2 and A3 are used as scratch.
.section .text
.align 2
.global ___mulhi3
___mulhi3:
mov.b A1L,A2L ; A2l gets srcb.l
mulxu A0L,A2 ; A2 gets first sub product
mov.b A0H,A3L ; prepare for
mulxu A1L,A3 ; second sub product
add.b A3L,A2H ; sum first two terms
mov.b A1H,A3L ; third sub product
mulxu A0L,A3
add.b A3L,A2H ; add the low byte of the third sub product
mov.w A2,A0 ; return the product in A0
rts
#endif
#endif /* L_mulhi3 */
#ifdef L_mulsi3
;; SImode multiply.
;;
;; I think that shift and add may be sufficient for this. Using the
;; supplied 8x8->16 would need 10 ops of 14 cycles each + overhead. This way
;; the inner loop uses maybe 20 cycles + overhead, but terminates
;; quickly on small args.
;;
;; A0/A1 src_a
;; A2/A3 src_b
;;
;; while (a)
;; {
;; if (a & 1)
;; r += b;
;; a >>= 1;
;; b <<= 1;
;; }
.section .text
.align 2
#ifdef __H8300__
;; ___mulsi3 for the plain H8/300: 32-bit multiply by shift and add.
;;   in:  a in A0/A1, b in A2/A3 (high word first)
;;   out: A0/A1 = low 32 bits of a * b
;;   S0/S1 hold the running result r and are saved and restored here.
	.global	___mulsi3
___mulsi3:
	PUSHP	S0P
	PUSHP	S1P
	sub.w	S0,S0		; r = 0
	sub.w	S1,S1

	; while (a)
.Lmul_loop:
	mov.w	A0,A0
	bne	.Lmul_step
	mov.w	A1,A1
	beq	.Lmul_done
.Lmul_step:
	; if (a & 1)
	bld	#0,A1L		; carry <- bit 0 of a
	bcc	.Lmul_shift
	; r += b
	add.w	A3,S1
	addx	A2L,S0L
	addx	A2H,S0H
.Lmul_shift:
	; a >>= 1
	shlr	A0H
	rotxr	A0L
	rotxr	A1H
	rotxr	A1L
	; b <<= 1
	add.w	A3,A3
	addx	A2L,A2L
	addx	A2H,A2H
	bra	.Lmul_loop

.Lmul_done:
	mov.w	S0,A0		; return r
	mov.w	S1,A1
	POPP	S1P
	POPP	S0P
	rts
#else /* __H8300H__ */
;
; mulsi3 for H8/300H - based on Renesas SH implementation
;
; by Toshiyasu Morita
;
; Old code:
;
; 16b * 16b = 372 states (worst case)
; 32b * 32b = 724 states (worst case)
;
; New code:
;
; 16b * 16b = 48 states
; 16b * 32b = 72 states
; 32b * 32b = 92 states
;
; Operands: er0 = a:b and er1 = c:d, each written as high:low word.
; Result:   er0 = b*d + ((a*d + c*b) << 16), truncated to 32 bits.
; A cross product is skipped when its high-word factor is zero.
; er2 and er3 are used as scratch.
;
	.global	___mulsi3
___mulsi3:
	mov.w	r1,r2		; ( 2 states) b * d
	mulxu.w	r0,er2		; (22 states)
	mov.w	e0,r3		; ( 2 states) a * d
	beq	.Lmul_no_ad	; ( 4 states) a is zero, nothing to add
	mulxu.w	r1,er3		; (22 states)
	add.w	r3,e2		; ( 2 states) only the low word reaches the result
.Lmul_no_ad:
	mov.w	e1,r3		; ( 2 states) c * b
	beq	.Lmul_no_cb	; ( 4 states) c is zero, nothing to add
	mulxu.w	r0,er3		; (22 states)
	add.w	r3,e2		; ( 2 states)
.Lmul_no_cb:
	mov.l	er2,er0		; ( 2 states)
	rts			; (10 states)
#endif
#endif /* L_mulsi3 */
#ifdef L_fixunssfsi_asm
/* For the h8300 we use asm to save some bytes, to
allow more programs to fit into the tiny address
space. For the H8/300H and H8S, the C version is good enough. */
#ifdef __H8300__
/* We still treat NaNs differently than libgcc2.c, but then, the
behavior is undefined anyway. */
;; ___fixunssfsi: convert a single-precision float to an unsigned 32-bit
;; integer (plain H8/300 only).
;;   in:  float in A0/A1, A0H being the byte with the sign and the top
;;        seven exponent bits (assuming the usual IEEE single layout)
;;   out: A0/A1 = converted value
;;
;; Anything whose top byte is below 0x4f as a signed byte (this includes
;; every negative input) is handed to ___fixsfsi by a tail jump.  With a
;; top byte of exactly 0x4f and the low exponent bit clear, the 24-bit
;; mantissa is returned shifted left by 8.  Everything else gives
;; 0xffffffff.
;;
;; The section and alignment directives are spelled out here as they are
;; for every other routine in this file, and the A0H alias replaces the
;; bare r0h.
	.section .text
	.align 2
	.global	___fixunssfsi
___fixunssfsi:
	cmp.b	#0x4f,A0H
	bge	.Lfix_large	; signed compare, so negative inputs fall through
	jmp	@___fixsfsi	; the signed conversion handles this input
.Lfix_large:
	bhi	.Lfix_saturate	; flags are still those of the cmp.b: byte above 0x4f
	xor.b	#0x80,A0L	; low exponent bit 0 -> 1: it becomes the leading mantissa 1
	bmi	.Lfix_shift8
.Lfix_saturate:
	mov.w	#65535,A0	; too large: return 0xffffffff
	mov.w	A0,A1
	rts
.Lfix_shift8:
	mov.b	A0L,A0H		; result = mantissa << 8
	mov.b	A1H,A0L
	mov.b	A1L,A1H
	mov.b	#0,A1L
	rts
#endif
#endif /* L_fixunssfsi_asm */