/* ieee754-df.S double-precision floating point support for ARM

   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   Contributed by Nicolas Pitre (nico@cam.org)

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/*
 * Notes: 
 * 
 * The goal of this code is to be as fast as possible.  This is
 * not meant to be easy to understand for the casual reader.
 * For slightly simpler code please see the single precision version
 * of this file.
 * 
 * Only the default rounding mode is intended for best performances.
 * Exceptions aren't supported yet, but that can be added quite easily
 * if necessary without impacting performances.
 */


#ifndef __ARMEB__
#define xl r0
#define xh r1
#define yl r2
#define yh r3
#else
#define xh r0
#define xl r1
#define yh r2
#define yl r3
#endif


#ifdef L_arm_negdf2

ARM_FUNC_START negdf2
ARM_FUNC_ALIAS aeabi_dneg negdf2

	@ flip sign bit
	eor	xh, xh, #0x80000000
	RET

	FUNC_END aeabi_dneg
	FUNC_END negdf2

#endif

#ifdef L_arm_addsubdf3

ARM_FUNC_START aeabi_drsub

	eor	xh, xh, #0x80000000	@ flip sign bit of first arg
	b	1f	

ARM_FUNC_START subdf3
ARM_FUNC_ALIAS aeabi_dsub subdf3

	eor	yh, yh, #0x80000000	@ flip sign bit of second arg
#if defined(__INTERWORKING_STUBS__)
	b	1f			@ Skip Thumb-code prologue
#endif

ARM_FUNC_START adddf3
ARM_FUNC_ALIAS aeabi_dadd adddf3

1:	do_push	{r4, r5, lr}

	@ Look for zeroes, equal values, INF, or NAN.
	shift1	lsl, r4, xh, #1
	shift1	lsl, r5, yh, #1
	teq	r4, r5
	do_it	eq
	teqeq	xl, yl
	do_it	ne, ttt
	COND(orr,s,ne)	ip, r4, xl
	COND(orr,s,ne)	ip, r5, yl
	COND(mvn,s,ne)	ip, r4, asr #21
	COND(mvn,s,ne)	ip, r5, asr #21
	beq	LSYM(Lad_s)

	@ Compute exponent difference.  Make largest exponent in r4,
	@ corresponding arg in xh-xl, and positive exponent difference in r5.
	shift1	lsr, r4, r4, #21
	rsbs	r5, r4, r5, lsr #21
	do_it	lt
	rsblt	r5, r5, #0
	ble	1f
	add	r4, r4, r5
	eor	yl, xl, yl
	eor	yh, xh, yh
	eor	xl, yl, xl
	eor	xh, yh, xh
	eor	yl, xl, yl
	eor	yh, xh, yh
1:
	@ If exponent difference is too large, return largest argument
	@ already in xh-xl.  We need up to 54 bit to handle proper rounding
	@ of 0x1p54 - 1.1.
	cmp	r5, #54
	do_it	hi
	RETLDM	"r4, r5" hi

	@ Convert mantissa to signed integer.
	tst	xh, #0x80000000
	mov	xh, xh, lsl #12
	mov	ip, #0x00100000
	orr	xh, ip, xh, lsr #12
	beq	1f
#if defined(__thumb2__)
	negs	xl, xl
	sbc	xh, xh, xh, lsl #1
#else
	rsbs	xl, xl, #0
	rsc	xh, xh, #0
#endif
1:
	tst	yh, #0x80000000
	mov	yh, yh, lsl #12
	orr	yh, ip, yh, lsr #12
	beq	1f
#if defined(__thumb2__)
	negs	yl, yl
	sbc	yh, yh, yh, lsl #1
#else
	rsbs	yl, yl, #0
	rsc	yh, yh, #0
#endif
1:
	@ If exponent == difference, one or both args were denormalized.
	@ Since this is not common case, rescale them off line.
	teq	r4, r5
	beq	LSYM(Lad_d)
LSYM(Lad_x):

	@ Compensate for the exponent overlapping the mantissa MSB added later
	sub	r4, r4, #1

	@ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip.
	rsbs	lr, r5, #32
	blt	1f
	shift1	lsl, ip, yl, lr
	shiftop adds xl xl yl lsr r5 yl
	adc	xh, xh, #0
	shiftop adds xl xl yh lsl lr yl
	shiftop adcs xh xh yh asr r5 yh
	b	2f
1:	sub	r5, r5, #32
	add	lr, lr, #32
	cmp	yl, #1
	shift1	lsl,ip, yh, lr
	do_it	cs
	orrcs	ip, ip, #2		@ 2 not 1, to allow lsr #1 later
	shiftop adds xl xl yh asr r5 yh
	adcs	xh, xh, yh, asr #31
2:
	@ We now have a result in xh-xl-ip.
	@ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above)
	and	r5, xh, #0x80000000
	bpl	LSYM(Lad_p)
#if defined(__thumb2__)
	mov	lr, #0
	negs	ip, ip
	sbcs	xl, lr, xl
	sbc	xh, lr, xh
#else
	rsbs	ip, ip, #0
	rscs	xl, xl, #0
	rsc	xh, xh, #0
#endif

	@ Determine how to normalize the result.
LSYM(Lad_p):
	cmp	xh, #0x00100000
	bcc	LSYM(Lad_a)
	cmp	xh, #0x00200000
	bcc	LSYM(Lad_e)

	@ Result needs to be shifted right.
	movs	xh, xh, lsr #1
	movs	xl, xl, rrx
	mov	ip, ip, rrx
	add	r4, r4, #1

	@ Make sure we did not bust our exponent.
	mov	r2, r4, lsl #21
	cmn	r2, #(2 << 21)
	bcs	LSYM(Lad_o)

	@ Our result is now properly aligned into xh-xl, remaining bits in ip.
	@ Round with MSB of ip. If halfway between two numbers, round towards
	@ LSB of xl = 0.
	@ Pack final result together.
LSYM(Lad_e):
	cmp	ip, #0x80000000
	do_it	eq
	COND(mov,s,eq)	ip, xl, lsr #1
	adcs	xl, xl, #0
	adc	xh, xh, r4, lsl #20
	orr	xh, xh, r5
	RETLDM	"r4, r5"

	@ Result must be shifted left and exponent adjusted.
LSYM(Lad_a):
	movs	ip, ip, lsl #1
	adcs	xl, xl, xl
	adc	xh, xh, xh
	tst	xh, #0x00100000
	sub	r4, r4, #1
	bne	LSYM(Lad_e)

	@ No rounding necessary since ip will always be 0 at this point.
LSYM(Lad_l):

#if __ARM_ARCH__ < 5

	teq	xh, #0
	movne	r3, #20
	moveq	r3, #52
	moveq	xh, xl
	moveq	xl, #0
	mov	r2, xh
	cmp	r2, #(1 << 16)
	movhs	r2, r2, lsr #16
	subhs	r3, r3, #16
	cmp	r2, #(1 << 8)
	movhs	r2, r2, lsr #8
	subhs	r3, r3, #8
	cmp	r2, #(1 << 4)
	movhs	r2, r2, lsr #4
	subhs	r3, r3, #4
	cmp	r2, #(1 << 2)
	subhs	r3, r3, #2
	sublo	r3, r3, r2, lsr #1
	sub	r3, r3, r2, lsr #3

#else

	teq	xh, #0
	do_it	eq, t
	moveq	xh, xl
	moveq	xl, #0
	clz	r3, xh
	do_it	eq
	addeq	r3, r3, #32
	sub	r3, r3, #11

#endif

	@ determine how to shift the value.
	subs	r2, r3, #32
	bge	2f
	adds	r2, r2, #12
	ble	1f

	@ shift value left 21 to 31 bits, or actually right 11 to 1 bits
	@ since a register switch happened above.
	add	ip, r2, #20
	rsb	r2, r2, #12
	shift1	lsl, xl, xh, ip
	shift1	lsr, xh, xh, r2
	b	3f

	@ actually shift value left 1 to 20 bits, which might also represent
	@ 32 to 52 bits if counting the register switch that happened earlier.
1:	add	r2, r2, #20
2:	do_it	le
	rsble	ip, r2, #32
	shift1	lsl, xh, xh, r2
#if defined(__thumb2__)
	lsr	ip, xl, ip
	itt	le
	orrle	xh, xh, ip
	lslle	xl, xl, r2
#else
	orrle	xh, xh, xl, lsr ip
	movle	xl, xl, lsl r2
#endif

	@ adjust exponent accordingly.
3:	subs	r4, r4, r3
	do_it	ge, tt
	addge	xh, xh, r4, lsl #20
	orrge	xh, xh, r5
	RETLDM	"r4, r5" ge

	@ Exponent too small, denormalize result.
	@ Find out proper shift value.
	mvn	r4, r4
	subs	r4, r4, #31
	bge	2f
	adds	r4, r4, #12
	bgt	1f

	@ shift result right of 1 to 20 bits, sign is in r5.
	add	r4, r4, #20
	rsb	r2, r4, #32
	shift1	lsr, xl, xl, r4
	shiftop orr xl xl xh lsl r2 yh
	shiftop orr xh r5 xh lsr r4 yh
	RETLDM	"r4, r5"

	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
	@ a register switch from xh to xl.
1:	rsb	r4, r4, #12
	rsb	r2, r4, #32
	shift1	lsr, xl, xl, r2
	shiftop orr xl xl xh lsl r4 yh
	mov	xh, r5
	RETLDM	"r4, r5"

	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
	@ from xh to xl.
2:	shift1	lsr, xl, xh, r4
	mov	xh, r5
	RETLDM	"r4, r5"

	@ Adjust exponents for denormalized arguments.
	@ Note that r4 must not remain equal to 0.
LSYM(Lad_d):
	teq	r4, #0
	eor	yh, yh, #0x00100000
	do_it	eq, te
	eoreq	xh, xh, #0x00100000
	addeq	r4, r4, #1
	subne	r5, r5, #1
	b	LSYM(Lad_x)


LSYM(Lad_s):
	mvns	ip, r4, asr #21
	do_it	ne
	COND(mvn,s,ne)	ip, r5, asr #21
	beq	LSYM(Lad_i)

	teq	r4, r5
	do_it	eq
	teqeq	xl, yl
	beq	1f

	@ Result is x + 0.0 = x or 0.0 + y = y.
	orrs	ip, r4, xl
	do_it	eq, t
	moveq	xh, yh
	moveq	xl, yl
	RETLDM	"r4, r5"

1:	teq	xh, yh

	@ Result is x - x = 0.
	do_it	ne, tt
	movne	xh, #0
	movne	xl, #0
	RETLDM	"r4, r5" ne

	@ Result is x + x = 2x.
	movs	ip, r4, lsr #21
	bne	2f
	movs	xl, xl, lsl #1
	adcs	xh, xh, xh
	do_it	cs
	orrcs	xh, xh, #0x80000000
	RETLDM	"r4, r5"
2:	adds	r4, r4, #(2 << 21)
	do_it	cc, t
	addcc	xh, xh, #(1 << 20)
	RETLDM	"r4, r5" cc
	and	r5, xh, #0x80000000

	@ Overflow: return INF.
LSYM(Lad_o):
	orr	xh, r5, #0x7f000000
	orr	xh, xh, #0x00f00000
	mov	xl, #0
	RETLDM	"r4, r5"

	@ At least one of x or y is INF/NAN.
	@   if xh-xl != INF/NAN: return yh-yl (which is INF/NAN)
	@   if yh-yl != INF/NAN: return xh-xl (which is INF/NAN)
	@   if either is NAN: return NAN
	@   if opposite sign: return NAN
	@   otherwise return xh-xl (which is INF or -INF)
LSYM(Lad_i):
	mvns	ip, r4, asr #21
	do_it	ne, te
	movne	xh, yh
	movne	xl, yl
	COND(mvn,s,eq)	ip, r5, asr #21
	do_it	ne, t
	movne	yh, xh
	movne	yl, xl
	orrs	r4, xl, xh, lsl #12
	do_it	eq, te
	COND(orr,s,eq)	r5, yl, yh, lsl #12
	teqeq	xh, yh
	orrne	xh, xh, #0x00080000	@ quiet NAN
	RETLDM	"r4, r5"

	FUNC_END aeabi_dsub
	FUNC_END subdf3
	FUNC_END aeabi_dadd
	FUNC_END adddf3

ARM_FUNC_START floatunsidf
ARM_FUNC_ALIAS aeabi_ui2d floatunsidf

	teq	r0, #0
	do_it	eq, t
	moveq	r1, #0
	RETc(eq)
	do_push	{r4, r5, lr}
	mov	r4, #0x400		@ initial exponent
	add	r4, r4, #(52-1 - 1)
	mov	r5, #0			@ sign bit is 0
	.ifnc	xl, r0
	mov	xl, r0
	.endif
	mov	xh, #0
	b	LSYM(Lad_l)

	FUNC_END aeabi_ui2d
	FUNC_END floatunsidf

ARM_FUNC_START floatsidf
ARM_FUNC_ALIAS aeabi_i2d floatsidf

	teq	r0, #0
	do_it	eq, t
	moveq	r1, #0
	RETc(eq)
	do_push	{r4, r5, lr}
	mov	r4, #0x400		@ initial exponent
	add	r4, r4, #(52-1 - 1)
	ands	r5, r0, #0x80000000	@ sign bit in r5
	do_it	mi
	rsbmi	r0, r0, #0		@ absolute value
	.ifnc	xl, r0
	mov	xl, r0
	.endif
	mov	xh, #0
	b	LSYM(Lad_l)

	FUNC_END aeabi_i2d
	FUNC_END floatsidf

ARM_FUNC_START extendsfdf2
ARM_FUNC_ALIAS aeabi_f2d extendsfdf2

	movs	r2, r0, lsl #1		@ toss sign bit
	mov	xh, r2, asr #3		@ stretch exponent
	mov	xh, xh, rrx		@ retrieve sign bit
	mov	xl, r2, lsl #28		@ retrieve remaining bits
	do_it	ne, ttt
	COND(and,s,ne)	r3, r2, #0xff000000	@ isolate exponent
	teqne	r3, #0xff000000		@ if not 0, check if INF or NAN
	eorne	xh, xh, #0x38000000	@ fixup exponent otherwise.
	RETc(ne)			@ and return it.

	teq	r2, #0			@ if actually 0
	do_it	ne, e
	teqne	r3, #0xff000000		@ or INF or NAN
	RETc(eq)			@ we are done already.

	@ value was denormalized.  We can normalize it now.
	do_push	{r4, r5, lr}
	mov	r4, #0x380		@ setup corresponding exponent
	and	r5, xh, #0x80000000	@ move sign bit in r5
	bic	xh, xh, #0x80000000
	b	LSYM(Lad_l)

	FUNC_END aeabi_f2d
	FUNC_END extendsfdf2

ARM_FUNC_START floatundidf
ARM_FUNC_ALIAS aeabi_ul2d floatundidf

	orrs	r2, r0, r1
	do_it	eq
	RETc(eq)

	do_push	{r4, r5, lr}

	mov	r5, #0
	b	2f

ARM_FUNC_START floatdidf
ARM_FUNC_ALIAS aeabi_l2d floatdidf

	orrs	r2, r0, r1
	do_it	eq
	RETc(eq)

	do_push	{r4, r5, lr}

	ands	r5, ah, #0x80000000	@ sign bit in r5
	bpl	2f
#if defined(__thumb2__)
	negs	al, al
	sbc	ah, ah, ah, lsl #1
#else
	rsbs	al, al, #0
	rsc	ah, ah, #0
#endif
2:
	mov	r4, #0x400		@ initial exponent
	add	r4, r4, #(52-1 - 1)

	@ If FP word order does not match integer word order, swap the words.
	.ifnc	xh, ah
	mov	ip, al
	mov	xh, ah
	mov	xl, ip
	.endif

	movs	ip, xh, lsr #22
	beq	LSYM(Lad_p)

	@ The value is too big.  Scale it down a bit...
	mov	r2, #3
	movs	ip, ip, lsr #3
	do_it	ne
	addne	r2, r2, #3
	movs	ip, ip, lsr #3
	do_it	ne
	addne	r2, r2, #3
	add	r2, r2, ip, lsr #3

	rsb	r3, r2, #32
	shift1	lsl, ip, xl, r3
	shift1	lsr, xl, xl, r2
	shiftop orr xl xl xh lsl r3 lr
	shift1	lsr, xh, xh, r2
	add	r4, r4, r2
	b	LSYM(Lad_p)

	FUNC_END floatdidf
	FUNC_END aeabi_l2d
	FUNC_END floatundidf
	FUNC_END aeabi_ul2d

#endif /* L_addsubdf3 */

#ifdef L_arm_muldivdf3

ARM_FUNC_START muldf3
ARM_FUNC_ALIAS aeabi_dmul muldf3
	do_push	{r4, r5, r6, lr}

	@ Mask out exponents, trap any zero/denormal/INF/NAN.
	mov	ip, #0xff
	orr	ip, ip, #0x700
	ands	r4, ip, xh, lsr #20
	do_it	ne, tte
	COND(and,s,ne)	r5, ip, yh, lsr #20
	teqne	r4, ip
	teqne	r5, ip
	bleq	LSYM(Lml_s)

	@ Add exponents together
	add	r4, r4, r5

	@ Determine final sign.
	eor	r6, xh, yh

	@ Convert mantissa to unsigned integer.
	@ If power of two, branch to a separate path.
	bic	xh, xh, ip, lsl #21
	bic	yh, yh, ip, lsl #21
	orrs	r5, xl, xh, lsl #12
	do_it	ne
	COND(orr,s,ne)	r5, yl, yh, lsl #12
	orr	xh, xh, #0x00100000
	orr	yh, yh, #0x00100000
	beq	LSYM(Lml_1)

#if __ARM_ARCH__ < 4

	@ Put sign bit in r6, which will be restored in yl later.
	and   r6, r6, #0x80000000

	@ Well, no way to make it shorter without the umull instruction.
	stmfd	sp!, {r6, r7, r8, r9, sl, fp}
	mov	r7, xl, lsr #16
	mov	r8, yl, lsr #16
	mov	r9, xh, lsr #16
	mov	sl, yh, lsr #16
	bic	xl, xl, r7, lsl #16
	bic	yl, yl, r8, lsl #16
	bic	xh, xh, r9, lsl #16
	bic	yh, yh, sl, lsl #16
	mul	ip, xl, yl
	mul	fp, xl, r8
	mov	lr, #0
	adds	ip, ip, fp, lsl #16
	adc	lr, lr, fp, lsr #16
	mul	fp, r7, yl
	adds	ip, ip, fp, lsl #16
	adc	lr, lr, fp, lsr #16
	mul	fp, xl, sl
	mov	r5, #0
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, r7, yh
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, xh, r8
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, r9, yl
	adds	lr, lr, fp, lsl #16
	adc	r5, r5, fp, lsr #16
	mul	fp, xh, sl
	mul	r6, r9, sl
	adds	r5, r5, fp, lsl #16
	adc	r6, r6, fp, lsr #16
	mul	fp, r9, yh
	adds	r5, r5, fp, lsl #16
	adc	r6, r6, fp, lsr #16
	mul	fp, xl, yh
	adds	lr, lr, fp
	mul	fp, r7, sl
	adcs	r5, r5, fp
	mul	fp, xh, yl
	adc	r6, r6, #0
	adds	lr, lr, fp
	mul	fp, r9, r8
	adcs	r5, r5, fp
	mul	fp, r7, r8
	adc	r6, r6, #0
	adds	lr, lr, fp
	mul	fp, xh, yh
	adcs	r5, r5, fp
	adc	r6, r6, #0
	ldmfd	sp!, {yl, r7, r8, r9, sl, fp}

#else

	@ Here is the actual multiplication.
	umull	ip, lr, xl, yl
	mov	r5, #0
	umlal	lr, r5, xh, yl
	and	yl, r6, #0x80000000
	umlal	lr, r5, xl, yh
	mov	r6, #0
	umlal	r5, r6, xh, yh

#endif

	@ The LSBs in ip are only significant for the final rounding.
	@ Fold them into lr.
	teq	ip, #0
	do_it	ne
	orrne	lr, lr, #1

	@ Adjust result upon the MSB position.
	sub	r4, r4, #0xff
	cmp	r6, #(1 << (20-11))
	sbc	r4, r4, #0x300
	bcs	1f
	movs	lr, lr, lsl #1
	adcs	r5, r5, r5
	adc	r6, r6, r6
1:
	@ Shift to final position, add sign to result.
	orr	xh, yl, r6, lsl #11
	orr	xh, xh, r5, lsr #21
	mov	xl, r5, lsl #11
	orr	xl, xl, lr, lsr #21
	mov	lr, lr, lsl #11

	@ Check exponent range for under/overflow.
	subs	ip, r4, #(254 - 1)
	do_it	hi
	cmphi	ip, #0x700
	bhi	LSYM(Lml_u)

	@ Round the result, merge final exponent.
	cmp	lr, #0x80000000
	do_it	eq
	COND(mov,s,eq)	lr, xl, lsr #1
	adcs	xl, xl, #0
	adc	xh, xh, r4, lsl #20
	RETLDM	"r4, r5, r6"

	@ Multiplication by 0x1p*: let''s shortcut a lot of code.
LSYM(Lml_1):
	and	r6, r6, #0x80000000
	orr	xh, r6, xh
	orr	xl, xl, yl
	eor	xh, xh, yh
	subs	r4, r4, ip, lsr #1
	do_it	gt, tt
	COND(rsb,s,gt)	r5, r4, ip
	orrgt	xh, xh, r4, lsl #20
	RETLDM	"r4, r5, r6" gt

	@ Under/overflow: fix things up for the code below.
	orr	xh, xh, #0x00100000
	mov	lr, #0
	subs	r4, r4, #1

LSYM(Lml_u):
	@ Overflow?
	bgt	LSYM(Lml_o)

	@ Check if denormalized result is possible, otherwise return signed 0.
	cmn	r4, #(53 + 1)
	do_it	le, tt
	movle	xl, #0
	bicle	xh, xh, #0x7fffffff
	RETLDM	"r4, r5, r6" le

	@ Find out proper shift value.
	rsb	r4, r4, #0
	subs	r4, r4, #32
	bge	2f
	adds	r4, r4, #12
	bgt	1f

	@ shift result right of 1 to 20 bits, preserve sign bit, round, etc.
	add	r4, r4, #20
	rsb	r5, r4, #32
	shift1	lsl, r3, xl, r5
	shift1	lsr, xl, xl, r4
	shiftop orr xl xl xh lsl r5 r2
	and	r2, xh, #0x80000000
	bic	xh, xh, #0x80000000
	adds	xl, xl, r3, lsr #31
	shiftop adc xh r2 xh lsr r4 r6
	orrs	lr, lr, r3, lsl #1
	do_it	eq
	biceq	xl, xl, r3, lsr #31
	RETLDM	"r4, r5, r6"

	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
	@ a register switch from xh to xl. Then round.
1:	rsb	r4, r4, #12
	rsb	r5, r4, #32
	shift1	lsl, r3, xl, r4
	shift1	lsr, xl, xl, r5
	shiftop orr xl xl xh lsl r4 r2
	bic	xh, xh, #0x7fffffff
	adds	xl, xl, r3, lsr #31
	adc	xh, xh, #0
	orrs	lr, lr, r3, lsl #1
	do_it	eq
	biceq	xl, xl, r3, lsr #31
	RETLDM	"r4, r5, r6"

	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
	@ from xh to xl.  Leftover bits are in r3-r6-lr for rounding.
2:	rsb	r5, r4, #32
	shiftop orr lr lr xl lsl r5 r2
	shift1	lsr, r3, xl, r4
	shiftop orr r3 r3 xh lsl r5 r2
	shift1	lsr, xl, xh, r4
	bic	xh, xh, #0x7fffffff
	shiftop bic xl xl xh lsr r4 r2
	add	xl, xl, r3, lsr #31
	orrs	lr, lr, r3, lsl #1
	do_it	eq
	biceq	xl, xl, r3, lsr #31
	RETLDM	"r4, r5, r6"

	@ One or both arguments are denormalized.
	@ Scale them leftwards and preserve sign bit.
LSYM(Lml_d):
	teq	r4, #0
	bne	2f
	and	r6, xh, #0x80000000
1:	movs	xl, xl, lsl #1
	adc	xh, xh, xh
	tst	xh, #0x00100000
	do_it	eq
	subeq	r4, r4, #1
	beq	1b
	orr	xh, xh, r6
	teq	r5, #0
	do_it	ne
	RETc(ne)
2:	and	r6, yh, #0x80000000
3:	movs	yl, yl, lsl #1
	adc	yh, yh, yh
	tst	yh, #0x00100000
	do_it	eq
	subeq	r5, r5, #1
	beq	3b
	orr	yh, yh, r6
	RET

LSYM(Lml_s):
	@ Isolate the INF and NAN cases away
	teq	r4, ip
	and	r5, ip, yh, lsr #20
	do_it	ne
	teqne	r5, ip
	beq	1f

	@ Here, one or more arguments are either denormalized or zero.
	orrs	r6, xl, xh, lsl #1
	do_it	ne
	COND(orr,s,ne)	r6, yl, yh, lsl #1
	bne	LSYM(Lml_d)

	@ Result is 0, but determine sign anyway.
LSYM(Lml_z):
	eor	xh, xh, yh
	and	xh, xh, #0x80000000
	mov	xl, #0
	RETLDM	"r4, r5, r6"

1:	@ One or both args are INF or NAN.
	orrs	r6, xl, xh, lsl #1
	do_it	eq, te
	moveq	xl, yl
	moveq	xh, yh
	COND(orr,s,ne)	r6, yl, yh, lsl #1
	beq	LSYM(Lml_n)		@ 0 * INF or INF * 0 -> NAN
	teq	r4, ip
	bne	1f
	orrs	r6, xl, xh, lsl #12
	bne	LSYM(Lml_n)		@ NAN * <anything> -> NAN
1:	teq	r5, ip
	bne	LSYM(Lml_i)
	orrs	r6, yl, yh, lsl #12
	do_it	ne, t
	movne	xl, yl
	movne	xh, yh
	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN

	@ Result is INF, but we need to determine its sign.
LSYM(Lml_i):
	eor	xh, xh, yh

	@ Overflow: return INF (sign already in xh).
LSYM(Lml_o):
	and	xh, xh, #0x80000000
	orr	xh, xh, #0x7f000000
	orr	xh, xh, #0x00f00000
	mov	xl, #0
	RETLDM	"r4, r5, r6"

	@ Return a quiet NAN.
LSYM(Lml_n):
	orr	xh, xh, #0x7f000000
	orr	xh, xh, #0x00f80000
	RETLDM	"r4, r5, r6"

	FUNC_END aeabi_dmul
	FUNC_END muldf3

ARM_FUNC_START divdf3
ARM_FUNC_ALIAS aeabi_ddiv divdf3
	
	do_push	{r4, r5, r6, lr}

	@ Mask out exponents, trap any zero/denormal/INF/NAN.
	mov	ip, #0xff
	orr	ip, ip, #0x700
	ands	r4, ip, xh, lsr #20
	do_it	ne, tte
	COND(and,s,ne)	r5, ip, yh, lsr #20
	teqne	r4, ip
	teqne	r5, ip
	bleq	LSYM(Ldv_s)

	@ Subtract divisor exponent from dividend''s.
	sub	r4, r4, r5

	@ Preserve final sign into lr.
	eor	lr, xh, yh

	@ Convert mantissa to unsigned integer.
	@ Dividend -> r5-r6, divisor -> yh-yl.
	orrs	r5, yl, yh, lsl #12
	mov	xh, xh, lsl #12
	beq	LSYM(Ldv_1)
	mov	yh, yh, lsl #12
	mov	r5, #0x10000000
	orr	yh, r5, yh, lsr #4
	orr	yh, yh, yl, lsr #24
	mov	yl, yl, lsl #8
	orr	r5, r5, xh, lsr #4
	orr	r5, r5, xl, lsr #24
	mov	r6, xl, lsl #8

	@ Initialize xh with final sign bit.
	and	xh, lr, #0x80000000

	@ Ensure result will land to known bit position.
	@ Apply exponent bias accordingly.
	cmp	r5, yh
	do_it	eq
	cmpeq	r6, yl
	adc	r4, r4, #(255 - 2)
	add	r4, r4, #0x300
	bcs	1f
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
1:
	@ Perform first subtraction to align result to a nibble.
	subs	r6, r6, yl
	sbc	r5, r5, yh
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	mov	xl, #0x00100000
	mov	ip, #0x00080000

	@ The actual division loop.
1:	subs	lr, r6, yl
	sbcs	lr, r5, yh
	do_it	cs, tt
	subcs	r6, r6, yl
	movcs	r5, lr
	orrcs	xl, xl, ip
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	subs	lr, r6, yl
	sbcs	lr, r5, yh
	do_it	cs, tt
	subcs	r6, r6, yl
	movcs	r5, lr
	orrcs	xl, xl, ip, lsr #1
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	subs	lr, r6, yl
	sbcs	lr, r5, yh
	do_it	cs, tt
	subcs	r6, r6, yl
	movcs	r5, lr
	orrcs	xl, xl, ip, lsr #2
	movs	yh, yh, lsr #1
	mov	yl, yl, rrx
	subs	lr, r6, yl
	sbcs	lr, r5, yh
	do_it	cs, tt
	subcs	r6, r6, yl
	movcs	r5, lr
	orrcs	xl, xl, ip, lsr #3

	orrs	lr, r5, r6
	beq	2f
	mov	r5, r5, lsl #4
	orr	r5, r5, r6, lsr #28
	mov	r6, r6, lsl #4
	mov	yh, yh, lsl #3
	orr	yh, yh, yl, lsr #29
	mov	yl, yl, lsl #3
	movs	ip, ip, lsr #4
	bne	1b

	@ We are done with a word of the result.
	@ Loop again for the low word if this pass was for the high word.
	tst	xh, #0x00100000
	bne	3f
	orr	xh, xh, xl
	mov	xl, #0
	mov	ip, #0x80000000
	b	1b
2:
	@ Be sure result starts in the high word.
	tst	xh, #0x00100000
	do_it	eq, t
	orreq	xh, xh, xl
	moveq	xl, #0
3:
	@ Check exponent range for under/overflow.
	subs	ip, r4, #(254 - 1)
	do_it	hi
	cmphi	ip, #0x700
	bhi	LSYM(Lml_u)

	@ Round the result, merge final exponent.
	subs	ip, r5, yh
	do_it	eq, t
	COND(sub,s,eq)	ip, r6, yl
	COND(mov,s,eq)	ip, xl, lsr #1
	adcs	xl, xl, #0
	adc	xh, xh, r4, lsl #20
	RETLDM	"r4, r5, r6"

	@ Division by 0x1p*: shortcut a lot of code.
LSYM(Ldv_1):
	and	lr, lr, #0x80000000
	orr	xh, lr, xh, lsr #12
	adds	r4, r4, ip, lsr #1
	do_it	gt, tt
	COND(rsb,s,gt)	r5, r4, ip
	orrgt	xh, xh, r4, lsl #20
	RETLDM	"r4, r5, r6" gt

	orr	xh, xh, #0x00100000
	mov	lr, #0
	subs	r4, r4, #1
	b	LSYM(Lml_u)

	@ Result mightt need to be denormalized: put remainder bits
	@ in lr for rounding considerations.
LSYM(Ldv_u):
	orr	lr, r5, r6
	b	LSYM(Lml_u)

	@ One or both arguments is either INF, NAN or zero.
LSYM(Ldv_s):
	and	r5, ip, yh, lsr #20
	teq	r4, ip
	do_it	eq
	teqeq	r5, ip
	beq	LSYM(Lml_n)		@ INF/NAN / INF/NAN -> NAN
	teq	r4, ip
	bne	1f
	orrs	r4, xl, xh, lsl #12
	bne	LSYM(Lml_n)		@ NAN / <anything> -> NAN
	teq	r5, ip
	bne	LSYM(Lml_i)		@ INF / <anything> -> INF
	mov	xl, yl
	mov	xh, yh
	b	LSYM(Lml_n)		@ INF / (INF or NAN) -> NAN
1:	teq	r5, ip
	bne	2f
	orrs	r5, yl, yh, lsl #12
	beq	LSYM(Lml_z)		@ <anything> / INF -> 0
	mov	xl, yl
	mov	xh, yh
	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
2:	@ If both are nonzero, we need to normalize and resume above.
	orrs	r6, xl, xh, lsl #1
	do_it	ne
	COND(orr,s,ne)	r6, yl, yh, lsl #1
	bne	LSYM(Lml_d)
	@ One or both arguments are 0.
	orrs	r4, xl, xh, lsl #1
	bne	LSYM(Lml_i)		@ <non_zero> / 0 -> INF
	orrs	r5, yl, yh, lsl #1
	bne	LSYM(Lml_z)		@ 0 / <non_zero> -> 0
	b	LSYM(Lml_n)		@ 0 / 0 -> NAN

	FUNC_END aeabi_ddiv
	FUNC_END divdf3

#endif /* L_muldivdf3 */

#ifdef L_arm_cmpdf2

@ Note: only r0 (return value) and ip are clobbered here.

ARM_FUNC_START gtdf2
ARM_FUNC_ALIAS gedf2 gtdf2
	mov	ip, #-1
	b	1f

ARM_FUNC_START ltdf2
ARM_FUNC_ALIAS ledf2 ltdf2
	mov	ip, #1
	b	1f

ARM_FUNC_START cmpdf2
ARM_FUNC_ALIAS nedf2 cmpdf2
ARM_FUNC_ALIAS eqdf2 cmpdf2
	mov	ip, #1			@ how should we specify unordered here?

1:	str	ip, [sp, #-4]!

	@ Trap any INF/NAN first.
	mov	ip, xh, lsl #1
	mvns	ip, ip, asr #21
	mov	ip, yh, lsl #1
	do_it	ne
	COND(mvn,s,ne)	ip, ip, asr #21
	beq	3f

	@ Test for equality.
	@ Note that 0.0 is equal to -0.0.
2:	add	sp, sp, #4
	orrs	ip, xl, xh, lsl #1	@ if x == 0.0 or -0.0
	do_it	eq, e
	COND(orr,s,eq)	ip, yl, yh, lsl #1	@ and y == 0.0 or -0.0
	teqne	xh, yh			@ or xh == yh
	do_it	eq, tt
	teqeq	xl, yl			@ and xl == yl
	moveq	r0, #0			@ then equal.
	RETc(eq)

	@ Clear C flag
	cmn	r0, #0

	@ Compare sign, 
	teq	xh, yh

	@ Compare values if same sign
	do_it	pl
	cmppl	xh, yh
	do_it	eq
	cmpeq	xl, yl

	@ Result:
	do_it	cs, e
	movcs	r0, yh, asr #31
	mvncc	r0, yh, asr #31
	orr	r0, r0, #1
	RET

	@ Look for a NAN.
3:	mov	ip, xh, lsl #1
	mvns	ip, ip, asr #21
	bne	4f
	orrs	ip, xl, xh, lsl #12
	bne	5f			@ x is NAN
4:	mov	ip, yh, lsl #1
	mvns	ip, ip, asr #21
	bne	2b
	orrs	ip, yl, yh, lsl #12
	beq	2b			@ y is not NAN
5:	ldr	r0, [sp], #4		@ unordered return code
	RET

	FUNC_END gedf2
	FUNC_END gtdf2
	FUNC_END ledf2
	FUNC_END ltdf2
	FUNC_END nedf2
	FUNC_END eqdf2
	FUNC_END cmpdf2

ARM_FUNC_START aeabi_cdrcmple

	mov	ip, r0
	mov	r0, r2
	mov	r2, ip
	mov	ip, r1
	mov	r1, r3
	mov	r3, ip
	b	6f
	
ARM_FUNC_START aeabi_cdcmpeq
ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq

	@ The status-returning routines are required to preserve all
	@ registers except ip, lr, and cpsr.
6:	do_push	{r0, lr}
	ARM_CALL cmpdf2
	@ Set the Z flag correctly, and the C flag unconditionally.
	cmp	r0, #0
	@ Clear the C flag if the return value was -1, indicating
	@ that the first operand was smaller than the second.
	do_it	mi
	cmnmi	r0, #0
	RETLDM	"r0"

	FUNC_END aeabi_cdcmple
	FUNC_END aeabi_cdcmpeq
	FUNC_END aeabi_cdrcmple
	
ARM_FUNC_START	aeabi_dcmpeq

	str	lr, [sp, #-8]!
	ARM_CALL aeabi_cdcmple
	do_it	eq, e
	moveq	r0, #1	@ Equal to.
	movne	r0, #0	@ Less than, greater than, or unordered.
	RETLDM

	FUNC_END aeabi_dcmpeq

ARM_FUNC_START	aeabi_dcmplt

	str	lr, [sp, #-8]!
	ARM_CALL aeabi_cdcmple
	do_it	cc, e
	movcc	r0, #1	@ Less than.
	movcs	r0, #0	@ Equal to, greater than, or unordered.
	RETLDM

	FUNC_END aeabi_dcmplt

ARM_FUNC_START	aeabi_dcmple

	str	lr, [sp, #-8]!
	ARM_CALL aeabi_cdcmple
	do_it	ls, e
	movls	r0, #1  @ Less than or equal to.
	movhi	r0, #0	@ Greater than or unordered.
	RETLDM

	FUNC_END aeabi_dcmple

ARM_FUNC_START	aeabi_dcmpge

	str	lr, [sp, #-8]!
	ARM_CALL aeabi_cdrcmple
	do_it	ls, e
	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
	RETLDM

	FUNC_END aeabi_dcmpge

ARM_FUNC_START	aeabi_dcmpgt

	str	lr, [sp, #-8]!
	ARM_CALL aeabi_cdrcmple
	do_it	cc, e
	movcc	r0, #1	@ Operand 2 is less than operand 1.
	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
			@ or they are unordered.
	RETLDM

	FUNC_END aeabi_dcmpgt

#endif /* L_cmpdf2 */

#ifdef L_arm_unorddf2

ARM_FUNC_START unorddf2
ARM_FUNC_ALIAS aeabi_dcmpun unorddf2

	mov	ip, xh, lsl #1
	mvns	ip, ip, asr #21
	bne	1f
	orrs	ip, xl, xh, lsl #12
	bne	3f			@ x is NAN
1:	mov	ip, yh, lsl #1
	mvns	ip, ip, asr #21
	bne	2f
	orrs	ip, yl, yh, lsl #12
	bne	3f			@ y is NAN
2:	mov	r0, #0			@ arguments are ordered.
	RET

3:	mov	r0, #1			@ arguments are unordered.
	RET

	FUNC_END aeabi_dcmpun
	FUNC_END unorddf2

#endif /* L_unorddf2 */

#ifdef L_arm_fixdfsi

ARM_FUNC_START fixdfsi
ARM_FUNC_ALIAS aeabi_d2iz fixdfsi

	@ check exponent range.
	mov	r2, xh, lsl #1
	adds	r2, r2, #(1 << 21)
	bcs	2f			@ value is INF or NAN
	bpl	1f			@ value is too small
	mov	r3, #(0xfffffc00 + 31)
	subs	r2, r3, r2, asr #21
	bls	3f			@ value is too large

	@ scale value
	mov	r3, xh, lsl #11
	orr	r3, r3, #0x80000000
	orr	r3, r3, xl, lsr #21
	tst	xh, #0x80000000		@ the sign bit
	shift1	lsr, r0, r3, r2
	do_it	ne
	rsbne	r0, r0, #0
	RET

1:	mov	r0, #0
	RET

2:	orrs	xl, xl, xh, lsl #12
	bne	4f			@ x is NAN.
3:	ands	r0, xh, #0x80000000	@ the sign bit
	do_it	eq
	moveq	r0, #0x7fffffff		@ maximum signed positive si
	RET

4:	mov	r0, #0			@ How should we convert NAN?
	RET

	FUNC_END aeabi_d2iz
	FUNC_END fixdfsi

#endif /* L_fixdfsi */

#ifdef L_arm_fixunsdfsi

ARM_FUNC_START fixunsdfsi
ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi

	@ check exponent range.
	movs	r2, xh, lsl #1
	bcs	1f			@ value is negative
	adds	r2, r2, #(1 << 21)
	bcs	2f			@ value is INF or NAN
	bpl	1f			@ value is too small
	mov	r3, #(0xfffffc00 + 31)
	subs	r2, r3, r2, asr #21
	bmi	3f			@ value is too large

	@ scale value
	mov	r3, xh, lsl #11
	orr	r3, r3, #0x80000000
	orr	r3, r3, xl, lsr #21
	shift1	lsr, r0, r3, r2
	RET

1:	mov	r0, #0
	RET

2:	orrs	xl, xl, xh, lsl #12
	bne	4f			@ value is NAN.
3:	mov	r0, #0xffffffff		@ maximum unsigned si
	RET

4:	mov	r0, #0			@ How should we convert NAN?
	RET

	FUNC_END aeabi_d2uiz
	FUNC_END fixunsdfsi

#endif /* L_fixunsdfsi */

#ifdef L_arm_truncdfsf2

ARM_FUNC_START truncdfsf2
ARM_FUNC_ALIAS aeabi_d2f truncdfsf2

	@ check exponent range.
	mov	r2, xh, lsl #1
	subs	r3, r2, #((1023 - 127) << 21)
	do_it	cs, t
	COND(sub,s,cs)	ip, r3, #(1 << 21)
	COND(rsb,s,cs)	ip, ip, #(254 << 21)
	bls	2f			@ value is out of range

1:	@ shift and round mantissa
	and	ip, xh, #0x80000000
	mov	r2, xl, lsl #3
	orr	xl, ip, xl, lsr #29
	cmp	r2, #0x80000000
	adc	r0, xl, r3, lsl #2
	do_it	eq
	biceq	r0, r0, #1
	RET

2:	@ either overflow or underflow
	tst	xh, #0x40000000
	bne	3f			@ overflow

	@ check if denormalized value is possible
	adds	r2, r3, #(23 << 21)
	do_it	lt, t
	andlt	r0, xh, #0x80000000	@ too small, return signed 0.
	RETc(lt)

	@ denormalize value so we can resume with the code above afterwards.
	orr	xh, xh, #0x00100000
	mov	r2, r2, lsr #21
	rsb	r2, r2, #24
	rsb	ip, r2, #32
#if defined(__thumb2__)
	lsls	r3, xl, ip
#else
	movs	r3, xl, lsl ip
#endif
	shift1	lsr, xl, xl, r2
	do_it	ne
	orrne	xl, xl, #1		@ fold r3 for rounding considerations. 
	mov	r3, xh, lsl #11
	mov	r3, r3, lsr #11
	shiftop orr xl xl r3 lsl ip ip
	shift1	lsr, r3, r3, r2
	mov	r3, r3, lsl #1
	b	1b

3:	@ chech for NAN
	mvns	r3, r2, asr #21
	bne	5f			@ simple overflow
	orrs	r3, xl, xh, lsl #12
	do_it	ne, tt
	movne	r0, #0x7f000000
	orrne	r0, r0, #0x00c00000
	RETc(ne)			@ return NAN

5:	@ return INF with sign
	and	r0, xh, #0x80000000
	orr	r0, r0, #0x7f000000
	orr	r0, r0, #0x00800000
	RET

	FUNC_END aeabi_d2f
	FUNC_END truncdfsf2

#endif /* L_truncdfsf2 */