From 463e45dcb4aa1a878b5ae5c6bbaf4ae70e24bf50 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Mon, 9 Nov 2020 19:08:30 -0800
Subject: [PATCH] softfloat: Introduce sh[lr]_double primitives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Have x86_64 assembly for them, with a fallback.
This avoids shuffling values through %cl in the x86 case.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 fpu/softfloat.c                | 102 +++++++++++++++++++++++++--------
 include/fpu/softfloat-macros.h |  36 ++++++++++++
 2 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 571309e74f..34689959a9 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -957,15 +957,12 @@ static int frac128_normalize(FloatParts128 *a)
 {
     if (a->frac_hi) {
         int shl = clz64(a->frac_hi);
-        if (shl) {
-            int shr = 64 - shl;
-            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
-            a->frac_lo = (a->frac_lo << shl);
-        }
+        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
+        a->frac_lo <<= shl;
         return shl;
     } else if (a->frac_lo) {
         int shl = clz64(a->frac_lo);
-        a->frac_hi = (a->frac_lo << shl);
+        a->frac_hi = a->frac_lo << shl;
         a->frac_lo = 0;
         return shl + 64;
     }
@@ -976,7 +973,7 @@ static int frac256_normalize(FloatParts256 *a)
 {
     uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
     uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
-    int ret, shl, shr;
+    int ret, shl;
 
     if (likely(a0)) {
         shl = clz64(a0);
@@ -1006,11 +1003,10 @@ static int frac256_normalize(FloatParts256 *a)
         ret += shl;
     }
 
-    shr = -shl & 63;
-    a0 = (a0 << shl) | (a1 >> shr);
-    a1 = (a1 << shl) | (a2 >> shr);
-    a2 = (a2 << shl) | (a3 >> shr);
-    a3 = (a3 << shl);
+    a0 = shl_double(a0, a1, shl);
+    a1 = shl_double(a1, a2, shl);
+    a2 = shl_double(a2, a3, shl);
+    a3 <<= shl;
 
  done:
     a->frac_hi = a0;
@@ -1029,7 +1025,20 @@ static void frac64_shl(FloatParts64 *a, int c)
 
 static void frac128_shl(FloatParts128 *a, int c)
 {
-    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
+    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
+
+    if (c & 64) {               /* bit 6 of c set: move low word up */
+        a0 = a1, a1 = 0;
+    }
+
+    c &= 63;                    /* keep only the sub-word bit count */
+    if (c) {
+        a0 = shl_double(a0, a1, c);
+        a1 = a1 << c;
+    }
+
+    a->frac_hi = a0;
+    a->frac_lo = a1;
 }
 
 #define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)
@@ -1041,19 +1050,68 @@ static void frac64_shr(FloatParts64 *a, int c)
 
 static void frac128_shr(FloatParts128 *a, int c)
 {
-    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
+    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
+
+    if (c & 64) {               /* bit 6 of c set: move high word down */
+        a1 = a0, a0 = 0;
+    }
+
+    c &= 63;                    /* keep only the sub-word bit count */
+    if (c) {
+        a1 = shr_double(a0, a1, c);
+        a0 = a0 >> c;
+    }
+
+    a->frac_hi = a0;
+    a->frac_lo = a1;
 }
 
 #define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)
 
 static void frac64_shrjam(FloatParts64 *a, int c)
 {
-    shift64RightJamming(a->frac, c, &a->frac);
+    uint64_t a0 = a->frac;
+
+    if (likely(c != 0)) {
+        if (likely(c < 64)) {   /* OR "any shifted-out bit set" into bit 0 */
+            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
+        } else {                /* every bit is shifted out */
+            a0 = a0 != 0;
+        }
+        a->frac = a0;
+    }
 }
 
 static void frac128_shrjam(FloatParts128 *a, int c)
 {
-    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
+    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
+    uint64_t sticky = 0;        /* accumulates every bit shifted out */
+
+    if (unlikely(c == 0)) {
+        return;
+    } else if (likely(c < 64)) {
+        /* sub-word shift only; handled by the common code below */
+    } else if (likely(c < 128)) {
+        sticky = a1;            /* the whole low word is shifted out */
+        a1 = a0;
+        a0 = 0;
+        c &= 63;
+        if (c == 0) {
+            goto done;
+        }
+    } else {
+        sticky = a0 | a1;       /* everything is shifted out */
+        a0 = a1 = 0;
+        goto done;
+    }
+
+    sticky |= shr_double(a1, 0, c);     /* bits about to fall off a1 */
+    a1 = shr_double(a0, a1, c);
+    a0 = a0 >> c;
+
+ done:
+    a->frac_lo = a1 | (sticky != 0);    /* jam lost bits into bit 0 */
+    a->frac_hi = a0;
 }
 
 static void frac256_shrjam(FloatParts256 *a, int c)
@@ -1061,7 +1119,6 @@ static void frac256_shrjam(FloatParts256 *a, int c)
     uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
     uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
     uint64_t sticky = 0;
-    int invc;
 
     if (unlikely(c == 0)) {
         return;
@@ -1086,12 +1143,11 @@ static void frac256_shrjam(FloatParts256 *a, int c)
         goto done;
     }
 
-    invc = -c & 63;
-    sticky |= a3 << invc;
-    a3 = (a3 >> c) | (a2 << invc);
-    a2 = (a2 >> c) | (a1 << invc);
-    a1 = (a1 >> c) | (a0 << invc);
-    a0 = (a0 >> c);
+    sticky |= shr_double(a3, 0, c);
+    a3 = shr_double(a2, a3, c);
+    a2 = shr_double(a1, a2, c);
+    a1 = shr_double(a0, a1, c);
+    a0 = a0 >> c;
 
  done:
     a->frac_lo = a3 | (sticky != 0);
diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index 672c1db555..ec4e27a595 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -85,6 +85,42 @@ this code that are retained.
 #include "fpu/softfloat-types.h"
 #include "qemu/host-utils.h"
 
+/**
+ * shl_double: double-word merging left shift
+ * @l: left or most-significant word
+ * @r: right or least-significant word
+ * @c: shift count; the C fallback is only well-defined for 0 <= @c <= 63
+ *
+ * Return @l shifted left by @c bits, with the top @c bits of @r shifted in.
+ */
+static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
+{
+#if defined(__x86_64__)
+    asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
+    return l;
+#else /* c == 0 is special-cased: r >> 64 would be undefined */
+    return c ? (l << c) | (r >> (64 - c)) : l;
+#endif
+}
+
+/**
+ * shr_double: double-word merging right shift
+ * @l: left or most-significant word
+ * @r: right or least-significant word
+ * @c: shift count; the C fallback is only well-defined for 0 <= @c <= 63
+ *
+ * Return @r shifted right by @c bits, with the low @c bits of @l shifted in.
+ */
+static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
+{
+#if defined(__x86_64__)
+    asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
+    return r;
+#else /* c == 0 is special-cased: l << 64 would be undefined */
+    return c ? (r >> c) | (l << (64 - c)) : r;
+#endif
+}
+
 /*----------------------------------------------------------------------------
 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
 | bits are shifted off, they are ``jammed'' into the least significant bit of