351 lines
7.3 KiB
ArmAsm
351 lines
7.3 KiB
ArmAsm
/* From the Intel IA-64 Optimization Guide, choose the minimum latency
   alternative.  */
|
|
|
|
#include <sysdep.h>
|
|
#undef ret
|
|
|
|
#include <shlib-compat.h>
|
|
|
|
#if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6)
|
|
|
|
/* __divtf3 -- 80-bit IEEE double-extended division.

   In:   farg0 = dividend (a), farg1 = divisor (b).
   Out:  fret0 = quotient.

   frcpa leaves a reciprocal estimate in f10 and sets p6 when that
   estimate has to be refined; every refinement step below is predicated
   on p6.  p7 is kept as the complement of p6, and on the p7 path the
   value frcpa left in f10 is returned unchanged.

   Intermediate steps use status field .s1; only frcpa and the final fma
   use .s0.  Several groups read a register in one slot and overwrite it
   in the next, so the order of instructions inside a group must be
   kept.  */

ENTRY(___divtf3)
	cmp.eq p7, p0 = r0, r0			/* p7 = 1 (r0 always equals itself)  */
	frcpa.s0 f10, p6 = farg0, farg1		/* f10 = y0, estimate of 1/b  */
	;;
(p6)	cmp.ne p7, p0 = r0, r0			/* refinement path: p7 = 0  */
	.pred.rel.mutex p6, p7
(p6)	fnma.s1 f11 = farg1, f10, f1		/* e  = 1 - b*y0  */
(p6)	fmpy.s1 f12 = farg0, f10		/* q0 = a*y0  */
	;;
(p6)	fmpy.s1 f13 = f11, f11			/* f13 = e*e  */
(p6)	fma.s1 f14 = f11, f11, f11		/* f14 = e*e + e  */
	;;
(p6)	fma.s1 f11 = f13, f13, f11		/* f11 = (e*e)*(e*e) + e  */
(p6)	fma.s1 f13 = f14, f10, f10		/* y1 = f14*y0 + y0 (f13 read above first)  */
	;;
(p6)	fma.s1 f10 = f13, f11, f10		/* y2 = y1*f11 + y0  */
(p6)	fnma.s1 f11 = farg1, f12, farg0		/* r0 = a - b*q0 (f11 read above first)  */
	;;
(p6)	fma.s1 f11 = f11, f10, f12		/* q1 = r0*y2 + q0  */
(p6)	fnma.s1 f12 = farg1, f10, f1		/* e2 = 1 - b*y2 (f12 read above first)  */
	;;
(p6)	fma.s1 f10 = f12, f10, f10		/* y3 = e2*y2 + y2  */
(p6)	fnma.s1 f12 = farg1, f11, farg0		/* r1 = a - b*q1 (f12 read above first)  */
	;;
(p6)	fma.s0 fret0 = f12, f10, f11		/* result = r1*y3 + q1  */
(p7)	mov fret0 = f10				/* no refinement: return frcpa's f10  */
	br.ret.sptk rp
END(___divtf3)
	.symver ___divtf3, __divtf3@GLIBC_2.2
|
|
|
|
/* __divdf3 -- 64-bit IEEE double division.

   In:   farg0 = dividend (a), farg1 = divisor (b).
   Out:  fret0 = quotient.

   frcpa leaves a reciprocal estimate in f10 and sets p6 when the
   refinement steps (all predicated on p6) have to run.  p7 is kept as
   the complement of p6; on the p7 path f10 is returned unchanged.

   Both the quotient estimate (f11) and the reciprocal (f10) are refined
   with successively squared error terms; the last quotient step and the
   remainder are computed with the .d (double) completer.  */

ENTRY(___divdf3)
	cmp.eq p7, p0 = r0, r0			/* p7 = 1 (r0 always equals itself)  */
	frcpa.s0 f10, p6 = farg0, farg1		/* f10 = y0, estimate of 1/b  */
	;;
(p6)	cmp.ne p7, p0 = r0, r0			/* refinement path: p7 = 0  */
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10		/* q0 = a*y0  */
(p6)	fnma.s1 f12 = farg1, f10, f1		/* e  = 1 - b*y0  */
	;;
(p6)	fma.s1 f11 = f12, f11, f11		/* q1 = e*q0 + q0  */
(p6)	fmpy.s1 f13 = f12, f12			/* e2 = e*e  */
	;;
(p6)	fma.s1 f10 = f12, f10, f10		/* y1 = e*y0 + y0  */
(p6)	fma.s1 f11 = f13, f11, f11		/* q2 = e2*q1 + q1  */
	;;
(p6)	fmpy.s1 f12 = f13, f13			/* e4 = e2*e2  */
(p6)	fma.s1 f10 = f13, f10, f10		/* y2 = e2*y1 + y1  */
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11		/* q3 = e4*q2 + q2  */
(p6)	fma.s1 f10 = f12, f10, f10		/* y3 = e4*y2 + y2  */
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0	/* r  = a - b*q3  */
	;;
(p6)	fma.d.s0 fret0 = f8, f10, f11		/* result = r*y3 + q3  */
(p7)	mov fret0 = f10				/* no refinement: return frcpa's f10  */
	br.ret.sptk rp
	;;
END(___divdf3)
	.symver ___divdf3, __divdf3@GLIBC_2.2
|
|
|
|
/* __divsf3 -- 32-bit IEEE float division.

   In:   farg0 = dividend (a), farg1 = divisor (b).
   Out:  fret0 = quotient.

   frcpa leaves a reciprocal estimate in f10 and sets p6 when the
   refinement steps (all predicated on p6) have to run.  p7 is kept as
   the complement of p6; on the p7 path f10 is returned unchanged.

   Only the quotient estimate is refined here: f8 holds the running
   quotient and f9 the running error term, which is squared after each
   step.  The result is finally rounded to single precision by
   fnorm.s.  */

ENTRY(___divsf3)
	cmp.eq p7, p0 = r0, r0			/* p7 = 1 (r0 always equals itself)  */
	frcpa.s0 f10, p6 = farg0, farg1		/* f10 = y0, estimate of 1/b  */
	;;
(p6)	cmp.ne p7, p0 = r0, r0			/* refinement path: p7 = 0  */
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10			/* q0 = a*y0  */
(p6)	fnma.s1 f9 = farg1, f10, f1		/* e  = 1 - b*y0  */
	;;
(p6)	fma.s1 f8 = f9, f8, f8			/* q1 = e*q0 + q0  */
(p6)	fmpy.s1 f9 = f9, f9			/* e2 = e*e (old f9 read above first)  */
	;;
(p6)	fma.s1 f8 = f9, f8, f8			/* q2 = e2*q1 + q1  */
(p6)	fmpy.s1 f9 = f9, f9			/* e4 = e2*e2 (old f9 read above first)  */
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8		/* q3 = e4*q2 + q2, in double  */
	;;
(p6)	fnorm.s.s0 fret0 = f10			/* round q3 to single  */
(p7)	mov fret0 = f10				/* no refinement: return frcpa's f10  */
	br.ret.sptk rp
	;;
END(___divsf3)
	.symver ___divsf3, __divsf3@GLIBC_2.2
|
|
|
|
/* __divdi3 -- 64-bit signed integer division.

   In:   in0 = dividend (a), in1 = divisor (b).
   Out:  ret0 = quotient, truncated.

   The operands are moved to FP registers, converted to floating point,
   divided with a frcpa estimate plus refinement steps (predicated on
   p6), and the quotient is converted back to an integer.  */

ENTRY(___divdi3)
	.regstk 2,0,0,0
	/* Move both integer operands into FP registers.  */
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	/* Convert them to FP as signed values, so that they are not
	   treated as unsigned.  */
	fcvt.xf f8 = f8				/* f8 = a  */
	fcvt.xf f9 = f9				/* f9 = b  */
	;;
	/* Initial reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9		/* f10 = y0, estimate of 1/b  */
	;;
	/* Three Newton-Raphson iterations.  */
(p6)	fnma.s1 f11 = f9, f10, f1		/* e  = 1 - b*y0  */
(p6)	fmpy.s1 f12 = f8, f10			/* q0 = a*y0  */
	;;
(p6)	fmpy.s1 f13 = f11, f11			/* e2 = e*e  */
(p6)	fma.s1 f12 = f11, f12, f12		/* q1 = e*q0 + q0  */
	;;
(p6)	fma.s1 f10 = f11, f10, f10		/* y1 = e*y0 + y0  */
(p6)	fma.s1 f11 = f13, f12, f12		/* q2 = e2*q1 + q1 (f11 read above first)  */
	;;
(p6)	fma.s1 f10 = f13, f10, f10		/* y2 = e2*y1 + y1  */
(p6)	fnma.s1 f12 = f9, f11, f8		/* r  = a - b*q2  */
	;;
(p6)	fma.s1 f10 = f12, f10, f11		/* q3 = r*y2 + q2  */
	;;
	/* Truncate the quotient to a signed integer.  */
	fcvt.fx.trunc.s1 f10 = f10
	;;
	/* Hand the result back in a general register.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___divdi3)
	.symver ___divdi3, __divdi3@GLIBC_2.2
|
|
|
|
/* __moddi3 -- 64-bit signed integer modulus.

   In:   in0 = dividend (a), in1 = divisor (b).
   Out:  ret0 = a - q*b, where q is the truncated quotient a/b.

   The quotient is computed in floating point exactly as for a division
   (frcpa estimate plus refinement steps predicated on p6).  Meanwhile
   in1 is negated in place and reloaded into f9, so the remainder can be
   formed by one integer multiply-add.  f14 keeps the untouched integer
   image of a for that final step.  */

ENTRY(___moddi3)
	.regstk 2,0,0,0
	/* Move both integer operands into FP registers.  */
	setf.sig f14 = in0			/* f14 = a (integer image, kept)  */
	setf.sig f9 = in1
	;;
	/* Convert them to FP as signed values, so that they are not
	   treated as unsigned.  */
	fcvt.xf f8 = f14			/* f8 = a  */
	fcvt.xf f9 = f9				/* f9 = b  */
	;;
	/* Initial reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9		/* f10 = y0, estimate of 1/b  */
	;;
	/* Three Newton-Raphson iterations.  */
(p6)	fmpy.s1 f12 = f8, f10			/* q0 = a*y0  */
(p6)	fnma.s1 f11 = f9, f10, f1		/* e  = 1 - b*y0  */
	;;
(p6)	fma.s1 f12 = f11, f12, f12		/* q1 = e*q0 + q0  */
(p6)	fmpy.s1 f13 = f11, f11			/* e2 = e*e  */
	;;
(p6)	fma.s1 f10 = f11, f10, f10		/* y1 = e*y0 + y0  */
(p6)	fma.s1 f11 = f13, f12, f12		/* q2 = e2*q1 + q1 (f11 read above first)  */
	;;
	sub in1 = r0, in1			/* in1 = -b  */
(p6)	fma.s1 f10 = f13, f10, f10		/* y2 = e2*y1 + y1  */
(p6)	fnma.s1 f12 = f9, f11, f8		/* r  = a - b*q2 (last FP use of b in f9)  */
	;;
	setf.sig f9 = in1			/* f9 = -b (integer image)  */
(p6)	fma.s1 f10 = f12, f10, f11		/* q3 = r*y2 + q2  */
	;;
	/* Truncate the quotient to a signed integer.  */
	fcvt.fx.trunc.s1 f10 = f10
	;;
	/* r = q * (-b) + a  */
	xma.l f10 = f10, f9, f14
	;;
	/* Hand the result back in a general register.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___moddi3)
	.symver ___moddi3, __moddi3@GLIBC_2.2
|
|
|
|
/* __udivdi3 -- 64-bit unsigned integer division.

   In:   in0 = dividend (a), in1 = divisor (b).
   Out:  ret0 = quotient, truncated.

   Same scheme as the signed division, but the operands are converted
   with the unsigned fcvt.xuf and the quotient is converted back with
   the unsigned fcvt.fxu.trunc.  */

ENTRY(___udivdi3)
	.regstk 2,0,0,0
	/* Move both integer operands into FP registers.  */
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	/* Convert them to FP, to avoid FP software-assist faults.  */
	fcvt.xuf.s1 f8 = f8			/* f8 = a  */
	fcvt.xuf.s1 f9 = f9			/* f9 = b  */
	;;
	/* Initial reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9		/* f10 = y0, estimate of 1/b  */
	;;
	/* Three Newton-Raphson iterations.  */
(p6)	fnma.s1 f11 = f9, f10, f1		/* e  = 1 - b*y0  */
(p6)	fmpy.s1 f12 = f8, f10			/* q0 = a*y0  */
	;;
(p6)	fmpy.s1 f13 = f11, f11			/* e2 = e*e  */
(p6)	fma.s1 f12 = f11, f12, f12		/* q1 = e*q0 + q0  */
	;;
(p6)	fma.s1 f10 = f11, f10, f10		/* y1 = e*y0 + y0  */
(p6)	fma.s1 f11 = f13, f12, f12		/* q2 = e2*q1 + q1 (f11 read above first)  */
	;;
(p6)	fma.s1 f10 = f13, f10, f10		/* y2 = e2*y1 + y1  */
(p6)	fnma.s1 f12 = f9, f11, f8		/* r  = a - b*q2  */
	;;
(p6)	fma.s1 f10 = f12, f10, f11		/* q3 = r*y2 + q2  */
	;;
	/* Truncate the quotient to an unsigned integer.  */
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	/* Hand the result back in a general register.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___udivdi3)
	.symver ___udivdi3, __udivdi3@GLIBC_2.2
|
|
|
|
/* __umoddi3 -- 64-bit unsigned integer modulus.

   In:   in0 = dividend (a), in1 = divisor (b).
   Out:  ret0 = a - q*b, where q is the truncated quotient a/b.

   The quotient is computed in floating point as for an unsigned
   division (fcvt.xuf inputs, frcpa estimate plus refinement steps
   predicated on p6, fcvt.fxu.trunc).  Meanwhile in1 is negated in place
   and reloaded into f9, so the remainder can be formed by one integer
   multiply-add.  f14 keeps the untouched integer image of a for that
   final step.  */

ENTRY(___umoddi3)
	.regstk 2,0,0,0
	/* Move both integer operands into FP registers.  */
	setf.sig f14 = in0			/* f14 = a (integer image, kept)  */
	setf.sig f9 = in1
	;;
	/* Convert them to FP, to avoid FP software assist faults.  */
	fcvt.xuf.s1 f8 = f14			/* f8 = a  */
	fcvt.xuf.s1 f9 = f9			/* f9 = b  */
	;;
	/* Initial reciprocal approximation.  */
	frcpa.s1 f10, p6 = f8, f9		/* f10 = y0, estimate of 1/b  */
	;;
	/* Three Newton-Raphson iterations.  */
(p6)	fmpy.s1 f12 = f8, f10			/* q0 = a*y0  */
(p6)	fnma.s1 f11 = f9, f10, f1		/* e  = 1 - b*y0  */
	;;
(p6)	fma.s1 f12 = f11, f12, f12		/* q1 = e*q0 + q0  */
(p6)	fmpy.s1 f13 = f11, f11			/* e2 = e*e  */
	;;
(p6)	fma.s1 f10 = f11, f10, f10		/* y1 = e*y0 + y0  */
(p6)	fma.s1 f11 = f13, f12, f12		/* q2 = e2*q1 + q1 (f11 read above first)  */
	;;
	sub in1 = r0, in1			/* in1 = -b  */
(p6)	fma.s1 f10 = f13, f10, f10		/* y2 = e2*y1 + y1  */
(p6)	fnma.s1 f12 = f9, f11, f8		/* r  = a - b*q2 (last FP use of b in f9)  */
	;;
	setf.sig f9 = in1			/* f9 = -b (integer image)  */
(p6)	fma.s1 f10 = f12, f10, f11		/* q3 = r*y2 + q2  */
	;;
	/* Truncate the quotient to an unsigned integer.  */
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	/* r = q * (-b) + a  */
	xma.l f10 = f10, f9, f14
	;;
	/* Hand the result back in a general register.  */
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
END(___umoddi3)
	.symver ___umoddi3, __umoddi3@GLIBC_2.2
|
|
|
|
/* __multi3 -- 128-bit multiply of two 128-bit operands.

   In:   in0/in1 = one multiplicand (a), in2/in3 = the other (b).
         The code treats in0 and in2 as the low 64-bit words.
   Out:  ret0 = low 64 bits of the product, ret1 = high 64 bits.

   The low words are split into 32-bit halves (aL, aH from in0 and
   bL, bH from in2) and in0*in2 is rebuilt from the four partial
   products, propagating carries by hand.  The two cross products
   in1*in2 and in0*in3 only contribute their low 64 bits, which are
   added into ret1.  r19 holds the 32-bit mask throughout.  */

ENTRY(___multi3)
	.regstk 4,0,0,0
	setf.sig f6 = in1
	movl r19 = 0xffffffff			/* r19 = low-32-bit mask  */
	setf.sig f7 = in2
	;;
	and r14 = r19, in0			/* r14 = aL  */
	;;
	setf.sig f10 = r14			/* f10 = aL  */
	and r14 = r19, in2			/* r14 = bL  */
	xmpy.l f9 = f6, f7			/* f9 = low64 (in1 * in2)  */
	;;
	setf.sig f6 = r14			/* f6 = bL  */
	shr.u r14 = in0, 32			/* r14 = aH  */
	;;
	setf.sig f7 = r14			/* f7 = aH  */
	shr.u r14 = in2, 32			/* r14 = bH  */
	;;
	setf.sig f8 = r14			/* f8 = bH  */
	xmpy.l f11 = f10, f6			/* f11 = aL*bL  */
	xmpy.l f6 = f7, f6			/* f6 = aH*bL  */
	;;
	getf.sig r16 = f11			/* r16 = aL*bL  */
	xmpy.l f7 = f7, f8			/* f7 = aH*bH  */
	;;
	shr.u r14 = r16, 32			/* r14 = upper half of aL*bL  */
	and r16 = r19, r16			/* r16 = lower half of aL*bL  */
	getf.sig r17 = f6			/* r17 = aH*bL  */
	setf.sig f6 = in0
	;;
	setf.sig f11 = r14
	getf.sig r21 = f7			/* r21 = aH*bH  */
	setf.sig f7 = in3
	;;
	xma.l f11 = f10, f8, f11		/* f11 = aL*bH + upper half of aL*bL  */
	xma.l f6 = f6, f7, f9			/* f6 = low64 (in0*in3 + in1*in2)  */
	;;
	getf.sig r18 = f11
	;;
	add r18 = r18, r17			/* r18 = middle sum, may wrap  */
	;;
	and r15 = r19, r18			/* r15 = low 32 bits of middle sum  */
	cmp.ltu p7, p6 = r18, r17		/* p7 = the add above wrapped  */
	;;
	getf.sig r22 = f6			/* r22 = cross-term sum  */
(p7)	adds r14 = 1, r19			/* r14 = 0x100000000, the lost carry  */
	;;
(p7)	add r21 = r21, r14			/* fold the carry into aH*bH  */
	shr.u r14 = r18, 32			/* r14 = upper half of middle sum (written after the read above)  */
	shl r15 = r15, 32			/* middle low half moved to bits 32..63  */
	;;
	add r20 = r21, r14			/* r20 = high word of in0*in2  */
	;;
	add ret0 = r15, r16			/* low 64 bits of the product  */
	add ret1 = r22, r20			/* high 64 bits of the product  */
	br.ret.sptk rp
	;;
END(___multi3)
	.symver ___multi3, __multi3@GLIBC_2.2
|
|
|
|
#endif
|