From 989bdb7461a5f1a8bbb5eb8525b4fa498f3644b4 Mon Sep 17 00:00:00 2001
From: Georg-Johann Lay
Date: Mon, 21 Nov 2011 08:56:44 +0000
Subject: [PATCH] re PR target/49313 (Inefficient libgcc implementations for avr)

	PR target/49313
	* config/avr/t-avr (LIB2FUNCS_EXCLUDE): Add _moddi3, _umoddi3.
	(LIB1ASMFUNCS): Add _divdi3, _udivdi3, _udivmod64, _negdi2.
	* config/avr/lib1funcs.S (wmov): New assembler macro.
	(__umoddi3, __udivdi3, __udivdi3_umoddi3): New functions.
	(__moddi3, __divdi3, __divdi3_moddi3): New functions.
	(__udivmod64): New function.
	(__negdi2): New function.

From-SVN: r181551
---
 libgcc/ChangeLog              |  11 ++
 libgcc/config/avr/lib1funcs.S | 356 ++++++++++++++++++++++++++++++++++
 libgcc/config/avr/t-avr       |   4 +
 3 files changed, 371 insertions(+)

diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog
index bfcbb8e57bb..b3c8d66b072 100644
--- a/libgcc/ChangeLog
+++ b/libgcc/ChangeLog
@@ -1,3 +1,14 @@
+2011-11-21  Georg-Johann Lay
+
+	PR target/49313
+	* config/avr/t-avr (LIB2FUNCS_EXCLUDE): Add _moddi3, _umoddi3.
+	(LIB1ASMFUNCS): Add _divdi3, _udivdi3, _udivmod64, _negdi2.
+	* config/avr/lib1funcs.S (wmov): New assembler macro.
+	(__umoddi3, __udivdi3, __udivdi3_umoddi3): New functions.
+	(__moddi3, __divdi3, __divdi3_moddi3): New functions.
+	(__udivmod64): New function.
+	(__negdi2): New function.
+
 2011-11-21  Gerald Pfeifer
 
 	* config.host (*-*-freebsd[12], *-*-freebsd[12].*,
diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S
index aee69e50d8e..c592c4caa5d 100644
--- a/libgcc/config/avr/lib1funcs.S
+++ b/libgcc/config/avr/lib1funcs.S
@@ -61,6 +61,15 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #endif
 .endm
 
+.macro wmov r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+    movw \r_dest, \r_src
+#else
+    mov \r_dest, \r_src
+    mov \r_dest+1, \r_src+1
+#endif
+.endm
+
 #if defined (__AVR_HAVE_JMP_CALL__)
 #define XCALL call
 #define XJMP jmp
@@ -846,6 +855,352 @@ __divmodsi4_exit:
 ENDF __divmodsi4
 #endif /* defined (L_divmodsi4) */
 
+
+/*******************************************************
+       Division 64 / 64
+       Modulo   64 % 64
+*******************************************************/
+
+;; Use the Speed-optimized Version on "big" Devices, i.e. Devices with
+;; at least 16k of Program Memory.  For smaller Devices, the Choice
+;; depends on the Availability of MOVW.
+
+#if defined (__AVR_HAVE_JMP_CALL__)
+#  define SPEED_DIV 8
+#elif defined (__AVR_HAVE_MOVW__)
+#  define SPEED_DIV 16
+#else
+#  define SPEED_DIV 0
+#endif
+
+;; A[0..7]: In: Dividend;
+;;          Out: Quotient  (T = 0)
+;;          Out: Remainder (T = 1)
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+#define A4 A0+4
+#define A5 A0+5
+#define A6 A0+6
+#define A7 A0+7
+
+;; B[0..7]: In: Divisor;  Out: Clobber
+#define B0 10
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+#define B4 B0+4
+#define B5 B0+5
+#define B6 B0+6
+#define B7 B0+7
+
+;; C[0..7]: Expand Remainder;  Out: Remainder (unused)
+#define C0 8
+#define C1 C0+1
+#define C2 30
+#define C3 C2+1
+#define C4 28
+#define C5 C4+1
+#define C6 26
+#define C7 C6+1
+
+;; Holds the Signs during the Division Routine
+#define SS __tmp_reg__
+
+;; Bit-Counter in the Division Routine
+#define R_cnt __zero_reg__
+
+;; Scratch Register for Negation
+#define NN r31
+
+#if defined (L_udivdi3)
+
+;; R25:R18 = R24:R18 umod R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __umoddi3
+    set
+    rjmp __udivdi3_umoddi3
+ENDF __umoddi3
+
+;; R25:R18 = R24:R18 udiv R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __udivdi3
+    clt
+ENDF __udivdi3
+
+DEFUN __udivdi3_umoddi3
+    push C0
+    push C1
+    push C4
+    push C5
+    XCALL __udivmod64
+    pop C5
+    pop C4
+    pop C1
+    pop C0
+    ret
+ENDF __udivdi3_umoddi3
+#endif /* L_udivdi3 */
+
+#if defined (L_udivmod64)
+
+;; Worker Routine for unsigned 64-Bit Quotient and Remainder Computation
+;; No Registers are saved/restored; the Callers take Care of that.
+;; Preserves B[] and the T-flag
+;; T = 0: Compute the Quotient in A[]
+;; T = 1: Compute the Remainder in A[] and shift SS one Bit left
+
+DEFUN __udivmod64
+
+    ;; Clear the Remainder (C6, C7 will follow)
+    clr C0
+    clr C1
+    wmov C2, C0
+    wmov C4, C0
+    ldi C7, 64
+
+#if SPEED_DIV == 0 || SPEED_DIV == 16
+    ;; Initialize the Loop-Counter
+    mov R_cnt, C7
+    wmov C6, C0
+#endif /* SPEED_DIV */
+
+#if SPEED_DIV == 8
+
+    push A7
+    clr C6
+
+1:  ;; Compare the shifted Dividend against the Divisor
+    ;; If -- even after Shifting -- it is smaller...
+    CP A7,B0   $ cpc C0,B1  $ cpc C1,B2  $ cpc C2,B3
+    cpc C3,B4  $ cpc C4,B5  $ cpc C5,B6  $ cpc C6,B7
+    brcc 2f
+
+    ;; ...then we can subtract it.  Thus, it is legal to shift left
+               $ mov C6,C5  $ mov C5,C4  $ mov C4,C3
+    mov C3,C2  $ mov C2,C1  $ mov C1,C0  $ mov C0,A7
+    mov A7,A6  $ mov A6,A5  $ mov A5,A4  $ mov A4,A3
+    mov A3,A2  $ mov A2,A1  $ mov A1,A0  $ clr A0
+
+    ;; 8 Bits are done
+    subi C7, 8
+    brne 1b
+
+    ;; Shifted 64 Bits:  A7 has traveled to C7
+    pop C7
+    ;; The Divisor is greater than the Dividend.  We have:
+    ;; A[] % B[] = A[]
+    ;; A[] / B[] = 0
+    ;; Thus, we can return immediately
+    rjmp 5f
+
+2:  ;; Initialize the Bit-Counter with the Number of Bits still to be done
+    mov R_cnt, C7
+
+    ;; Discard the pushed A7: the Remainder did not reach C7,
+    ;; which must be 0 again
+    pop C7
+    clr C7
+
+#elif SPEED_DIV == 16
+
+    ;; Compare the shifted Dividend against the Divisor
+    cp A7, B3
+    cpc C0, B4
+    cpc C1, B5
+    cpc C2, B6
+    cpc C3, B7
+    brcc 2f
+
+    ;; The Divisor is greater than the shifted Dividend: We can shift the
+    ;; Dividend and it is still smaller than the Divisor
+    ;; --> Shift one 32-Bit Chunk
+    wmov C2,A6   $ wmov C0,A4
+    wmov A6,A2   $ wmov A4,A0
+    wmov A2,C6   $ wmov A0,C4
+
+    ;; Set the Bit-Counter to 32
+    lsr R_cnt
+2:
+#elif SPEED_DIV
+#error SPEED_DIV = ?
+#endif /* SPEED_DIV */
+
+;; The actual Division + Remainder Routine
+
+3:  ;; Left-shift the Dividend...
+    lsl A0     $ rol A1     $ rol A2     $ rol A3
+    rol A4     $ rol A5     $ rol A6     $ rol A7
+
+    ;; ...into the Remainder
+    rol C0     $ rol C1     $ rol C2     $ rol C3
+    rol C4     $ rol C5     $ rol C6     $ rol C7
+
+    ;; Compare the Remainder and the Divisor
+    CP C0,B0   $ cpc C1,B1  $ cpc C2,B2  $ cpc C3,B3
+    cpc C4,B4  $ cpc C5,B5  $ cpc C6,B6  $ cpc C7,B7
+
+    brcs 4f
+
+    ;; The Divisor fits into the Remainder: Subtract it from the Remainder...
+    SUB C0,B0  $ sbc C1,B1  $ sbc C2,B2  $ sbc C3,B3
+    sbc C4,B4  $ sbc C5,B5  $ sbc C6,B6  $ sbc C7,B7
+
+    ;; ...and set the corresponding Bit in the upcoming Quotient
+    ;; The Bit will travel to its final Position
+    ori A0, 1
+
+4:  ;; This Bit is done
+    dec R_cnt
+    brne 3b
+    ;; __zero_reg__ is 0 again
+
+    ;; T = 0: We are fine with the Quotient in A[]
+    ;; T = 1: Copy the Remainder to A[]
+5:  brtc 6f
+    wmov A0, C0
+    wmov A2, C2
+    wmov A4, C4
+    wmov A6, C6
+    ;; Move the Sign of the Result to SS.7
+    lsl SS
+
+6:  ret
+
+ENDF __udivmod64
+#endif /* L_udivmod64 */
+
+
+#if defined (L_divdi3)
+
+;; R25:R18 = R24:R18 mod R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __moddi3
+    set
+    rjmp __divdi3_moddi3
+ENDF __moddi3
+
+;; R25:R18 = R24:R18 div R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __divdi3
+    clt
+ENDF __divdi3
+
+DEFUN __divdi3_moddi3
+#if SPEED_DIV
+    mov r31, A7
+    or r31, B7
+    brmi 0f
+    ;; Both Signs are 0: the following Complexity is not needed
+    XJMP __udivdi3_umoddi3
+#endif /* SPEED_DIV */
+
+0:  ;; The Prologue
+    ;; Save Z = 12 Registers: Y, 17...8
+    ;; No Frame needed (X = 0)
+    clr r26
+    clr r27
+    ldi r30, lo8(gs(1f))
+    ldi r31, hi8(gs(1f))
+    XJMP __prologue_saves__ + ((18 - 12) * 2)
+
+1:  ;; SS.7 will contain the Sign of the Quotient (A.sign * B.sign)
+    ;; SS.6 will contain the Sign of the Remainder (A.sign)
+    mov SS, A7
+    asr SS
+    ;; Adjust the Dividend's Sign as needed
+#if SPEED_DIV
+    ;; Compiling for Speed, we know that at least one Sign must be < 0
+    ;; Thus, if A[] >= 0, then we know B[] < 0
+    brpl 22f
+#else
+    brpl 21f
+#endif /* SPEED_DIV */
+
+    XCALL __negdi2
+
+    ;; Adjust the Divisor's Sign and SS.7 as needed
+21: tst B7
+    brpl 3f
+22: ldi NN, 1 << 7
+    eor SS, NN
+
+    ldi NN, -1
+    com B4     $ com B5     $ com B6     $ com B7
+               $ com B1     $ com B2     $ com B3
+    NEG B0
+               $ sbc B1,NN  $ sbc B2,NN  $ sbc B3,NN
+    sbc B4,NN  $ sbc B5,NN  $ sbc B6,NN  $ sbc B7,NN
+
+3:  ;; Do the unsigned 64-Bit Division/Modulo (depending on the T-flag)
+    XCALL __udivmod64
+
+    ;; Adjust the Result's Sign
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    tst SS
+    brpl 4f
+#else
+    sbrc SS, 7
+#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */
+    XCALL __negdi2
+
+4:  ;; Epilogue: Restore the Z = 12 Registers and return
+    in r28, __SP_L__
+    in r29, __SP_H__
+    ldi r30, 12
+    XJMP __epilogue_restores__ + ((18 - 12) * 2)
+
+ENDF __divdi3_moddi3
+
+#undef R_cnt
+#undef SS
+#undef NN
+
+#endif /* L_divdi3 */
+
+#if defined (L_negdi2)
+DEFUN __negdi2
+
+    com A4     $ com A5     $ com A6     $ com A7
+               $ com A1     $ com A2     $ com A3
+    NEG A0
+               $ sbci A1,-1 $ sbci A2,-1 $ sbci A3,-1
+    sbci A4,-1 $ sbci A5,-1 $ sbci A6,-1 $ sbci A7,-1
+    ret
+
+ENDF __negdi2
+#endif /* L_negdi2 */
+
+#undef C7
+#undef C6
+#undef C5
+#undef C4
+#undef C3
+#undef C2
+#undef C1
+#undef C0
+
+#undef B7
+#undef B6
+#undef B5
+#undef B4
+#undef B3
+#undef B2
+#undef B1
+#undef B0
+
+#undef A7
+#undef A6
+#undef A5
+#undef A4
+#undef A3
+#undef A2
+#undef A1
+#undef A0
+
 .section .text.libgcc.prologue, "ax", @progbits
@@ -854,6 +1209,7 @@ ENDF __divmodsi4
 **********************************/
 #if defined (L_prologue)
+;; This Function does not clobber the T-flag; the 64-Bit Division relies on it
 DEFUN __prologue_saves__
 	push r2
 	push r3
diff --git a/libgcc/config/avr/t-avr
b/libgcc/config/avr/t-avr index 5ebb17e4eb5..b5c8d05186a 100644 --- a/libgcc/config/avr/t-avr +++ b/libgcc/config/avr/t-avr @@ -15,6 +15,9 @@ LIB1ASMFUNCS = \ _divmodpsi4 _udivmodpsi4 \ _udivmodsi4 \ _divmodsi4 \ + _divdi3 _udivdi3 \ + _udivmod64 \ + _negdi2 \ _prologue \ _epilogue \ _exit \ @@ -50,6 +53,7 @@ LIB1ASMFUNCS = \ _fmul _fmuls _fmulsu LIB2FUNCS_EXCLUDE = \ + _moddi3 _umoddi3 \ _clz # We do not have the DF type.
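
Notes on the algorithm (editor's reference models; not part of the patch):

__udivmod64 is the classic restoring shift-and-subtract division: in each
of the 64 steps the dividend A[] is shifted left into the remainder C[];
whenever the divisor B[] fits into the remainder, it is subtracted and the
quotient bit just vacated at A[]'s low end is set.  A minimal C sketch of
that loop (illustrative only; the name udivmod64_model is made up, and
division by zero is left undefined just as in the assembly):

#include <stdint.h>

static uint64_t
udivmod64_model (uint64_t a, uint64_t b, uint64_t *rem)
{
  uint64_t c = 0;                  /* remainder, the C[] registers */

  for (int i = 0; i < 64; i++)
    {
      c = (c << 1) | (a >> 63);    /* left-shift dividend into remainder */
      a <<= 1;
      if (c >= b)                  /* divisor fits into remainder... */
        {
          c -= b;                  /* ...subtract it... */
          a |= 1;                  /* ...and set the quotient bit */
        }
    }

  *rem = c;
  return a;                        /* A[] now holds the quotient */
}

Accumulating the quotient in A[] itself is a register-economy trick: every
dividend bit shifted out at the top frees one bit at the bottom, so no
separate eight-byte quotient register set is needed.  The SPEED_DIV
variants only shorten this loop: SPEED_DIV == 8 first skips eight bits per
round while the shifted dividend is still smaller than the divisor, and
SPEED_DIV == 16 skips one 32-bit chunk, before running the bit loop for
the count left in R_cnt.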
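The signed entry points merely wrap this worker: __divdi3/__moddi3 record
the quotient's sign (sign(A) xor sign(B)) in SS.7 and the remainder's sign
(sign(A)) in SS.6, negate negative operands, run the unsigned division,
and negate the result when the selected sign bit is set, which yields C's
truncating division.  For mod, __udivmod64 itself shifts SS left so that
the remainder's sign lands in SS.7, letting one sbrc SS,7 test serve both
cases.  A hedged C sketch of that logic (divmod64_model and the
want_remainder flag, which stands in for the T-flag, are made up; it
reuses udivmod64_model from above):

#include <stdint.h>

static int64_t
divmod64_model (int64_t a, int64_t b, int want_remainder)
{
  /* Work on magnitudes; unsigned negation also handles INT64_MIN.  */
  uint64_t ua = a < 0 ? 0u - (uint64_t) a : (uint64_t) a;
  uint64_t ub = b < 0 ? 0u - (uint64_t) b : (uint64_t) b;
  uint64_t rem;
  uint64_t quo = udivmod64_model (ua, ub, &rem);

  /* Truncation toward zero: the remainder takes the sign of the
     dividend, the quotient the XOR of both signs.  */
  uint64_t res = want_remainder ? rem : quo;
  int negate = want_remainder ? a < 0 : (a < 0) != (b < 0);
  return (int64_t) (negate ? 0u - res : res);
}

Routing both entry points through one body selected by the T-flag is what
lets the patch share the expensive register save/restore sequence
(__prologue_saves__/__epilogue_restores__) between div and mod.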
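Finally, __negdi2 and the inline negation of B[] both compute the two's
complement byte-wise: -x = ~x + 1, where the "+ 1" is realized as NEG on
the lowest byte plus a carry ripple through the seven complemented upper
bytes (sbci Rn,-1 computes Rn + 1 - carry).  A small C model of that
ripple (illustrative; negdi2_model is a made-up name):

#include <stdint.h>

static uint64_t
negdi2_model (uint64_t x)
{
  uint64_t r = 0;
  unsigned carry = 1;                       /* the "+ 1" of ~x + 1 */

  for (int i = 0; i < 8; i++)
    {
      unsigned t = (uint8_t) ~(x >> (8 * i)) + carry;
      r |= (uint64_t) (uint8_t) t << (8 * i);
      carry = t >> 8;                       /* ripple into the next byte */
    }

  return r;                                 /* equals (uint64_t) -x */
}

The two assembly paths differ only for ISA reasons: A[] lives in
r18...r25, where the immediate form sbci Rn,-1 is available (immediate
instructions reach only r16...r31), while B[] starts at r10, so the patch
loads -1 into NN (r31) once and ripples with plain sbc instead.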