From 989bdb7461a5f1a8bbb5eb8525b4fa498f3644b4 Mon Sep 17 00:00:00 2001
From: Georg-Johann Lay
Date: Mon, 21 Nov 2011 08:56:44 +0000
Subject: [PATCH] re PR target/49313 (Inefficient libgcc implementations for avr)

	PR target/49313
	* config/avr/t-avr (LIB2FUNCS_EXCLUDE): Add _moddi3, _umoddi3.
	(LIB1ASMFUNCS): Add _divdi3, _udivdi3, _udivmod64, _negdi2.
	* config/avr/lib1funcs.S (wmov): New assembler macro.
	(__umoddi3, __udivdi3, __udivdi3_umoddi3): New functions.
	(__moddi3, __divdi3, __divdi3_moddi3): New functions.
	(__udivmod64): New function.
	(__negdi2): New function.

From-SVN: r181551
---
 libgcc/ChangeLog              |  11 ++
 libgcc/config/avr/lib1funcs.S | 356 ++++++++++++++++++++++++++++++++++
 libgcc/config/avr/t-avr       |   4 +
 3 files changed, 371 insertions(+)

diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog
index bfcbb8e57bb..b3c8d66b072 100644
--- a/libgcc/ChangeLog
+++ b/libgcc/ChangeLog
@@ -1,3 +1,14 @@
+2011-11-21  Georg-Johann Lay
+
+	PR target/49313
+	* config/avr/t-avr (LIB2FUNCS_EXCLUDE): Add _moddi3, _umoddi3.
+	(LIB1ASMFUNCS): Add _divdi3, _udivdi3, _udivmod64, _negdi2.
+	* config/avr/lib1funcs.S (wmov): New assembler macro.
+	(__umoddi3, __udivdi3, __udivdi3_umoddi3): New functions.
+	(__moddi3, __divdi3, __divdi3_moddi3): New functions.
+	(__udivmod64): New function.
+	(__negdi2): New function.
+
 2011-11-21  Gerald Pfeifer
 
 	* config.host (*-*-freebsd[12], *-*-freebsd[12].*,
diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S
index aee69e50d8e..c592c4caa5d 100644
--- a/libgcc/config/avr/lib1funcs.S
+++ b/libgcc/config/avr/lib1funcs.S
@@ -61,6 +61,15 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #endif
 .endm
 
+.macro wmov r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+    movw \r_dest, \r_src
+#else
+    mov \r_dest, \r_src
+    mov \r_dest+1, \r_src+1
+#endif
+.endm
+
 #if defined (__AVR_HAVE_JMP_CALL__)
 #define XCALL call
 #define XJMP jmp
@@ -846,6 +855,352 @@ __divmodsi4_exit:
 ENDF __divmodsi4
 #endif /* defined (L_divmodsi4) */
 
+
+/*******************************************************
+       Division 64 / 64
+       Modulo   64 % 64
+*******************************************************/
+
+;; Use the Speed-optimized Version on "big" Devices, i.e. Devices with
+;; at least 16k of Program Memory.  For smaller Devices, the Choice
+;; depends on the Availability of MOVW.
+
+#if defined (__AVR_HAVE_JMP_CALL__)
+#  define SPEED_DIV 8
+#elif defined (__AVR_HAVE_MOVW__)
+#  define SPEED_DIV 16
+#else
+#  define SPEED_DIV 0
+#endif
+
+;; A[0..7]: In: Dividend;
+;;          Out: Quotient  (T = 0)
+;;          Out: Remainder (T = 1)
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+#define A4 A0+4
+#define A5 A0+5
+#define A6 A0+6
+#define A7 A0+7
+
+;; B[0..7]: In: Divisor;  Out: Clobber
+#define B0 10
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+#define B4 B0+4
+#define B5 B0+5
+#define B6 B0+6
+#define B7 B0+7
+
+;; C[0..7]: Expand Remainder;  Out: Remainder (unused)
+#define C0 8
+#define C1 C0+1
+#define C2 30
+#define C3 C2+1
+#define C4 28
+#define C5 C4+1
+#define C6 26
+#define C7 C6+1
+
+;; Holds the Signs during the Division Routine
+#define SS __tmp_reg__
+
+;; Bit-Counter in the Division Routine
+#define R_cnt __zero_reg__
+
+;; Scratch Register for Negation
+#define NN r31
+
+#if defined (L_udivdi3)
+
+;; R25:R18 = R24:R18 umod R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __umoddi3
+    set
+    rjmp __udivdi3_umoddi3
+ENDF __umoddi3
+
+;; R25:R18 = R24:R18 udiv R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __udivdi3
+    clt
+ENDF __udivdi3
+
+DEFUN __udivdi3_umoddi3
+    push C0
+    push C1
+    push C4
+    push C5
+    XCALL __udivmod64
+    pop C5
+    pop C4
+    pop C1
+    pop C0
+    ret
+ENDF __udivdi3_umoddi3
+#endif /* L_udivdi3 */
+
+#if defined (L_udivmod64)
+
+;; Worker Routine for unsigned 64-Bit Quotient and Remainder Computation
+;; No Registers are saved/restored; the Callers take Care of that.
+;; Preserves B[] and the T-flag
+;; T = 0: Compute the Quotient in A[]
+;; T = 1: Compute the Remainder in A[] and shift SS one Bit left
+
+DEFUN __udivmod64
+
+    ;; Clear the Remainder (C6, C7 will follow)
+    clr C0
+    clr C1
+    wmov C2, C0
+    wmov C4, C0
+    ldi C7, 64
+
+#if SPEED_DIV == 0 || SPEED_DIV == 16
+    ;; Initialize the Loop-Counter
+    mov R_cnt, C7
+    wmov C6, C0
+#endif /* SPEED_DIV */
+
+#if SPEED_DIV == 8
+
+    push A7
+    clr C6
+
+1:  ;; Compare the shifted Dividend against the Divisor
+    ;; If -- even after Shifting -- it is smaller...
+    CP A7,B0   $ cpc C0,B1  $ cpc C1,B2  $ cpc C2,B3
+    cpc C3,B4  $ cpc C4,B5  $ cpc C5,B6  $ cpc C6,B7
+    brcc 2f
+
+    ;; ...then we can subtract it.  Thus, it is legal to shift left
+               $ mov C6,C5  $ mov C5,C4  $ mov C4,C3
+    mov C3,C2  $ mov C2,C1  $ mov C1,C0  $ mov C0,A7
+    mov A7,A6  $ mov A6,A5  $ mov A5,A4  $ mov A4,A3
+    mov A3,A2  $ mov A2,A1  $ mov A1,A0  $ clr A0
+
+    ;; 8 Bits are done
+    subi C7, 8
+    brne 1b
+
+    ;; Shifted 64 Bits:  A7 has traveled to C7
+    pop C7
+    ;; The Divisor is greater than the Dividend.  We have:
+    ;; A[] % B[] = A[]
+    ;; A[] / B[] = 0
+    ;; Thus, we can return immediately
+    rjmp 5f
+
+2:  ;; Initialize the Bit-Counter with the Number of Bits still to be done
+    mov R_cnt, C7
+
+    ;; Discard the pushed A7: the Remainder did not reach C7,
+    ;; which must be 0 again
+    pop C7
+    clr C7
+
+#elif SPEED_DIV == 16
+
+    ;; Compare the shifted Dividend against the Divisor
+    cp A7, B3
+    cpc C0, B4
+    cpc C1, B5
+    cpc C2, B6
+    cpc C3, B7
+    brcc 2f
+
+    ;; The Divisor is greater than the shifted Dividend: We can shift the
+    ;; Dividend and it is still smaller than the Divisor
+    ;; --> Shift one 32-Bit Chunk
+    wmov C2,A6   $ wmov C0,A4
+    wmov A6,A2   $ wmov A4,A0
+    wmov A2,C6   $ wmov A0,C4
+
+    ;; Set the Bit-Counter to 32
+    lsr R_cnt
+2:
+#elif SPEED_DIV
+#error SPEED_DIV = ?
+#endif /* SPEED_DIV */
+
+;; The actual Division + Remainder Routine
+
+3:  ;; Left-shift the Dividend...
+    lsl A0     $ rol A1     $ rol A2     $ rol A3
+    rol A4     $ rol A5     $ rol A6     $ rol A7
+
+    ;; ...into the Remainder
+    rol C0     $ rol C1     $ rol C2     $ rol C3
+    rol C4     $ rol C5     $ rol C6     $ rol C7
+
+    ;; Compare the Remainder and the Divisor
+    CP C0,B0   $ cpc C1,B1  $ cpc C2,B2  $ cpc C3,B3
+    cpc C4,B4  $ cpc C5,B5  $ cpc C6,B6  $ cpc C7,B7
+
+    brcs 4f
+
+    ;; The Divisor fits into the Remainder: Subtract it from the Remainder...
+    SUB C0,B0  $ sbc C1,B1  $ sbc C2,B2  $ sbc C3,B3
+    sbc C4,B4  $ sbc C5,B5  $ sbc C6,B6  $ sbc C7,B7
+
+    ;; ...and set the corresponding Bit in the upcoming Quotient
+    ;; The Bit will travel to its final Position
+    ori A0, 1
+
+4:  ;; This Bit is done
+    dec R_cnt
+    brne 3b
+    ;; __zero_reg__ is 0 again
+
+    ;; T = 0: We are fine with the Quotient in A[]
+    ;; T = 1: Copy the Remainder to A[]
+5:  brtc 6f
+    wmov A0, C0
+    wmov A2, C2
+    wmov A4, C4
+    wmov A6, C6
+    ;; Move the Sign of the Result to SS.7
+    lsl SS
+
+6:  ret
+
+ENDF __udivmod64
+#endif /* L_udivmod64 */
+
+
+#if defined (L_divdi3)
+
+;; R25:R18 = R24:R18 mod R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __moddi3
+    set
+    rjmp __divdi3_moddi3
+ENDF __moddi3
+
+;; R25:R18 = R24:R18 div R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __divdi3
+    clt
+ENDF __divdi3
+
+DEFUN __divdi3_moddi3
+#if SPEED_DIV
+    mov r31, A7
+    or r31, B7
+    brmi 0f
+    ;; Both Signs are 0: the following Complexity is not needed
+    XJMP __udivdi3_umoddi3
+#endif /* SPEED_DIV */
+
+0:  ;; The Prologue
+    ;; Save Z = 12 Registers: Y, 17...8
+    ;; No Frame needed (X = 0)
+    clr r26
+    clr r27
+    ldi r30, lo8(gs(1f))
+    ldi r31, hi8(gs(1f))
+    XJMP __prologue_saves__ + ((18 - 12) * 2)
+
+1:  ;; SS.7 will contain the Sign of the Quotient (A.sign * B.sign)
+    ;; SS.6 will contain the Sign of the Remainder (A.sign)
+    mov SS, A7
+    asr SS
+    ;; Adjust the Dividend's Sign as needed
+#if SPEED_DIV
+    ;; Compiling for Speed, we know that at least one Sign must be < 0
+    ;; Thus, if A[] >= 0, then we know B[] < 0
+    brpl 22f
+#else
+    brpl 21f
+#endif /* SPEED_DIV */
+
+    XCALL __negdi2
+
+    ;; Adjust the Divisor's Sign and SS.7 as needed
+21: tst B7
+    brpl 3f
+22: ldi NN, 1 << 7
+    eor SS, NN
+
+    ldi NN, -1
+    com B4     $ com B5     $ com B6     $ com B7
+               $ com B1     $ com B2     $ com B3
+    NEG B0
+               $ sbc B1,NN  $ sbc B2,NN  $ sbc B3,NN
+    sbc B4,NN  $ sbc B5,NN  $ sbc B6,NN  $ sbc B7,NN
+
+3:  ;; Do the unsigned 64-Bit Division/Modulo (depending on the T-flag)
+    XCALL __udivmod64
+
+    ;; Adjust the Result's Sign
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    tst SS
+    brpl 4f
+#else
+    sbrc SS, 7
+#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */
+    XCALL __negdi2
+
+4:  ;; Epilogue: Restore the Z = 12 Registers and return
+    in r28, __SP_L__
+    in r29, __SP_H__
+    ldi r30, 12
+    XJMP __epilogue_restores__ + ((18 - 12) * 2)
+
+ENDF __divdi3_moddi3
+
+#undef R_cnt
+#undef SS
+#undef NN
+
+#endif /* L_divdi3 */
+
+#if defined (L_negdi2)
+DEFUN __negdi2
+
+    com A4     $ com A5     $ com A6     $ com A7
+               $ com A1     $ com A2     $ com A3
+    NEG A0
+               $ sbci A1,-1 $ sbci A2,-1 $ sbci A3,-1
+    sbci A4,-1 $ sbci A5,-1 $ sbci A6,-1 $ sbci A7,-1
+    ret
+
+ENDF __negdi2
+#endif /* L_negdi2 */
+
+#undef C7
+#undef C6
+#undef C5
+#undef C4
+#undef C3
+#undef C2
+#undef C1
+#undef C0
+
+#undef B7
+#undef B6
+#undef B5
+#undef B4
+#undef B3
+#undef B2
+#undef B1
+#undef B0
+
+#undef A7
+#undef A6
+#undef A5
+#undef A4
+#undef A3
+#undef A2
+#undef A1
+#undef A0
+
 .section .text.libgcc.prologue, "ax", @progbits
@@ -854,6 +1209,7 @@ ENDF __divmodsi4
 **********************************/
 #if defined (L_prologue)
+;; This Function does not clobber the T-flag; the 64-Bit Division relies on it
 DEFUN __prologue_saves__
 	push r2
 	push r3
diff --git a/libgcc/config/avr/t-avr
b/libgcc/config/avr/t-avr index 5ebb17e4eb5..b5c8d05186a 100644 --- a/libgcc/config/avr/t-avr +++ b/libgcc/config/avr/t-avr @@ -15,6 +15,9 @@ LIB1ASMFUNCS = \ _divmodpsi4 _udivmodpsi4 \ _udivmodsi4 \ _divmodsi4 \ + _divdi3 _udivdi3 \ + _udivmod64 \ + _negdi2 \ _prologue \ _epilogue \ _exit \ @@ -50,6 +53,7 @@ LIB1ASMFUNCS = \ _fmul _fmuls _fmulsu LIB2FUNCS_EXCLUDE = \ + _moddi3 _umoddi3 \ _clz # We do not have the DF type.
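
Notes on the algorithm (editor's reference models; not part of the patch):

__udivmod64 is the classic restoring shift-and-subtract division: in each
of the 64 steps the dividend A[] is shifted left into the remainder C[];
whenever the divisor B[] fits into the remainder, it is subtracted and the
quotient bit just vacated at A[]'s low end is set.  A minimal C sketch of
that loop (illustrative only; the name udivmod64_model is made up, and
division by zero is left undefined just as in the assembly):

#include <stdint.h>

static uint64_t
udivmod64_model (uint64_t a, uint64_t b, uint64_t *rem)
{
  uint64_t c = 0;                  /* remainder, the C[] registers */

  for (int i = 0; i < 64; i++)
    {
      c = (c << 1) | (a >> 63);    /* left-shift dividend into remainder */
      a <<= 1;
      if (c >= b)                  /* divisor fits into remainder... */
        {
          c -= b;                  /* ...subtract it... */
          a |= 1;                  /* ...and set the quotient bit */
        }
    }

  *rem = c;
  return a;                        /* A[] now holds the quotient */
}

Accumulating the quotient in A[] itself is a register-economy trick: every
dividend bit shifted out at the top frees one bit at the bottom, so no
separate eight-byte quotient register set is needed.  The SPEED_DIV
variants only shorten this loop: SPEED_DIV == 8 first skips eight bits per
round while the shifted dividend is still smaller than the divisor, and
SPEED_DIV == 16 skips one 32-bit chunk, before running the bit loop for
the count left in R_cnt.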
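The signed entry points merely wrap this worker: __divdi3/__moddi3 record
the quotient's sign (sign(A) xor sign(B)) in SS.7 and the remainder's sign
(sign(A)) in SS.6, negate negative operands, run the unsigned division,
and negate the result when the selected sign bit is set, which yields C's
truncating division.  For mod, __udivmod64 itself shifts SS left so that
the remainder's sign lands in SS.7, letting one sbrc SS,7 test serve both
cases.  A hedged C sketch of that logic (divmod64_model and the
want_remainder flag, which stands in for the T-flag, are made up; it
reuses udivmod64_model from above):

#include <stdint.h>

static int64_t
divmod64_model (int64_t a, int64_t b, int want_remainder)
{
  /* Work on magnitudes; unsigned negation also handles INT64_MIN.  */
  uint64_t ua = a < 0 ? 0u - (uint64_t) a : (uint64_t) a;
  uint64_t ub = b < 0 ? 0u - (uint64_t) b : (uint64_t) b;
  uint64_t rem;
  uint64_t quo = udivmod64_model (ua, ub, &rem);

  /* Truncation toward zero: the remainder takes the sign of the
     dividend, the quotient the XOR of both signs.  */
  uint64_t res = want_remainder ? rem : quo;
  int negate = want_remainder ? a < 0 : (a < 0) != (b < 0);
  return (int64_t) (negate ? 0u - res : res);
}

Routing both entry points through one body selected by the T-flag is what
lets the patch share the expensive register save/restore sequence
(__prologue_saves__/__epilogue_restores__) between div and mod.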
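Finally, __negdi2 and the inline negation of B[] both compute the two's
complement byte-wise: -x = ~x + 1, where the "+ 1" is realized as NEG on
the lowest byte plus a carry ripple through the seven complemented upper
bytes (sbci Rn,-1 computes Rn + 1 - carry).  A small C model of that
ripple (illustrative; negdi2_model is a made-up name):

#include <stdint.h>

static uint64_t
negdi2_model (uint64_t x)
{
  uint64_t r = 0;
  unsigned carry = 1;                       /* the "+ 1" of ~x + 1 */

  for (int i = 0; i < 8; i++)
    {
      unsigned t = (uint8_t) ~(x >> (8 * i)) + carry;
      r |= (uint64_t) (uint8_t) t << (8 * i);
      carry = t >> 8;                       /* ripple into the next byte */
    }

  return r;                                 /* equals (uint64_t) -x */
}

The two assembly paths differ only for ISA reasons: A[] lives in
r18...r25, where the immediate form sbci Rn,-1 is available (immediate
instructions reach only r16...r31), while B[] starts at r10, so the patch
loads -1 into NN (r31) once and ripples with plain sbc instead.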