Reorg FloatParts to use QEMU_GENERIC.
Begin replacing the Berkeley float128 routines with FloatParts128. - includes a new implementation of float128_muladd - includes the snan silencing that was missing from float{32,64}_to_float128 and float128_to_float{32,64}. - does not include float128_min/max* (written but not yet reviewed). -----BEGIN PGP SIGNATURE----- iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmChD54dHHJpY2hhcmQu aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV94uAgApJMxVvkRkDuyOXG2 cM0dC+GQQ0prm5id2AW2JREiET+jo2NV7uU8IQGEQq3rtG8trws45gMQFgSRYJk2 sLlAIt4QqD6qzb2H9z+JhOx1yITlsuwrvr+BAwVtK7gw6l4LxKAs35SwWpz/Z5/2 R63bLyontVzzi40Bc4pB/h3CxdOR+UjZ2a2kDIZFuI/j+9pnPoEL/Vp9XMg85ex+ g21rRwE6qv4hrGMhej5YBKQoleoieL3FQ0sXQLi5lLNYejBpU45PjdgdEwbZIBhT 4sQkzV2HRrd84OrQIJU3Jd+zHZoSq6JQUZRSGAnqC7Mvigplo24J5GRjh6T8WoaI y495Lg== =MR2G -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/rth-gitlab/tags/pull-fp-20210516' into staging Reorg FloatParts to use QEMU_GENERIC. Begin replacing the Berkeley float128 routines with FloatParts128. - includes a new implementation of float128_muladd - includes the snan silencing that was missing from float{32,64}_to_float128 and float128_to_float{32,64}. - does not include float128_min/max* (written but not yet reviewed). 
# gpg: Signature made Sun 16 May 2021 13:27:10 BST # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "richard.henderson@linaro.org" # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full] # Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F * remotes/rth-gitlab/tags/pull-fp-20210516: (46 commits) softfloat: Move round_to_int_and_pack to softfloat-parts.c.inc softfloat: Move round_to_int to softfloat-parts.c.inc softfloat: Convert float-to-float conversions with float128 softfloat: Split float_to_float softfloat: Move div_floats to softfloat-parts.c.inc softfloat: Introduce sh[lr]_double primitives softfloat: Tidy mul128By64To192 softfloat: Use add192 in mul128To256 softfloat: Use mulu64 for mul64To128 softfloat: Move muladd_floats to softfloat-parts.c.inc softfloat: Move mul_floats to softfloat-parts.c.inc softfloat: Implement float128_add/sub via parts softfloat: Move addsub_floats to softfloat-parts.c.inc softfloat: Use uadd64_carry, usub64_borrow in softfloat-macros.h softfloat: Move round_canonical to softfloat-parts.c.inc softfloat: Move sf_canonicalize to softfloat-parts.c.inc softfloat: Move pick_nan_muladd to softfloat-parts.c.inc softfloat: Move pick_nan to softfloat-parts.c.inc softfloat: Move return_nan to softfloat-parts.c.inc softfloat: Convert float128_default_nan to parts ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
commit
1acbc0fdf2
|
@ -1073,9 +1073,8 @@ void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
|
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
|
||||||
int32_t ai = *(int32_t *)(a + i);
|
int32_t ai = *(int32_t *)(a + i);
|
||||||
int32_t bi = *(int32_t *)(b + i);
|
int32_t bi = *(int32_t *)(b + i);
|
||||||
int32_t di = ai + bi;
|
int32_t di;
|
||||||
if (((di ^ ai) &~ (ai ^ bi)) < 0) {
|
if (sadd32_overflow(ai, bi, &di)) {
|
||||||
/* Signed overflow. */
|
|
||||||
di = (di < 0 ? INT32_MAX : INT32_MIN);
|
di = (di < 0 ? INT32_MAX : INT32_MIN);
|
||||||
}
|
}
|
||||||
*(int32_t *)(d + i) = di;
|
*(int32_t *)(d + i) = di;
|
||||||
|
@ -1091,9 +1090,8 @@ void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
|
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
|
||||||
int64_t ai = *(int64_t *)(a + i);
|
int64_t ai = *(int64_t *)(a + i);
|
||||||
int64_t bi = *(int64_t *)(b + i);
|
int64_t bi = *(int64_t *)(b + i);
|
||||||
int64_t di = ai + bi;
|
int64_t di;
|
||||||
if (((di ^ ai) &~ (ai ^ bi)) < 0) {
|
if (sadd64_overflow(ai, bi, &di)) {
|
||||||
/* Signed overflow. */
|
|
||||||
di = (di < 0 ? INT64_MAX : INT64_MIN);
|
di = (di < 0 ? INT64_MAX : INT64_MIN);
|
||||||
}
|
}
|
||||||
*(int64_t *)(d + i) = di;
|
*(int64_t *)(d + i) = di;
|
||||||
|
@ -1143,9 +1141,8 @@ void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
|
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
|
||||||
int32_t ai = *(int32_t *)(a + i);
|
int32_t ai = *(int32_t *)(a + i);
|
||||||
int32_t bi = *(int32_t *)(b + i);
|
int32_t bi = *(int32_t *)(b + i);
|
||||||
int32_t di = ai - bi;
|
int32_t di;
|
||||||
if (((di ^ ai) & (ai ^ bi)) < 0) {
|
if (ssub32_overflow(ai, bi, &di)) {
|
||||||
/* Signed overflow. */
|
|
||||||
di = (di < 0 ? INT32_MAX : INT32_MIN);
|
di = (di < 0 ? INT32_MAX : INT32_MIN);
|
||||||
}
|
}
|
||||||
*(int32_t *)(d + i) = di;
|
*(int32_t *)(d + i) = di;
|
||||||
|
@ -1161,9 +1158,8 @@ void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
|
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
|
||||||
int64_t ai = *(int64_t *)(a + i);
|
int64_t ai = *(int64_t *)(a + i);
|
||||||
int64_t bi = *(int64_t *)(b + i);
|
int64_t bi = *(int64_t *)(b + i);
|
||||||
int64_t di = ai - bi;
|
int64_t di;
|
||||||
if (((di ^ ai) & (ai ^ bi)) < 0) {
|
if (ssub64_overflow(ai, bi, &di)) {
|
||||||
/* Signed overflow. */
|
|
||||||
di = (di < 0 ? INT64_MAX : INT64_MIN);
|
di = (di < 0 ? INT64_MAX : INT64_MIN);
|
||||||
}
|
}
|
||||||
*(int64_t *)(d + i) = di;
|
*(int64_t *)(d + i) = di;
|
||||||
|
@ -1209,8 +1205,8 @@ void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
|
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
|
||||||
uint32_t ai = *(uint32_t *)(a + i);
|
uint32_t ai = *(uint32_t *)(a + i);
|
||||||
uint32_t bi = *(uint32_t *)(b + i);
|
uint32_t bi = *(uint32_t *)(b + i);
|
||||||
uint32_t di = ai + bi;
|
uint32_t di;
|
||||||
if (di < ai) {
|
if (uadd32_overflow(ai, bi, &di)) {
|
||||||
di = UINT32_MAX;
|
di = UINT32_MAX;
|
||||||
}
|
}
|
||||||
*(uint32_t *)(d + i) = di;
|
*(uint32_t *)(d + i) = di;
|
||||||
|
@ -1226,8 +1222,8 @@ void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
|
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
|
||||||
uint64_t ai = *(uint64_t *)(a + i);
|
uint64_t ai = *(uint64_t *)(a + i);
|
||||||
uint64_t bi = *(uint64_t *)(b + i);
|
uint64_t bi = *(uint64_t *)(b + i);
|
||||||
uint64_t di = ai + bi;
|
uint64_t di;
|
||||||
if (di < ai) {
|
if (uadd64_overflow(ai, bi, &di)) {
|
||||||
di = UINT64_MAX;
|
di = UINT64_MAX;
|
||||||
}
|
}
|
||||||
*(uint64_t *)(d + i) = di;
|
*(uint64_t *)(d + i) = di;
|
||||||
|
@ -1273,8 +1269,8 @@ void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
|
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
|
||||||
uint32_t ai = *(uint32_t *)(a + i);
|
uint32_t ai = *(uint32_t *)(a + i);
|
||||||
uint32_t bi = *(uint32_t *)(b + i);
|
uint32_t bi = *(uint32_t *)(b + i);
|
||||||
uint32_t di = ai - bi;
|
uint32_t di;
|
||||||
if (ai < bi) {
|
if (usub32_overflow(ai, bi, &di)) {
|
||||||
di = 0;
|
di = 0;
|
||||||
}
|
}
|
||||||
*(uint32_t *)(d + i) = di;
|
*(uint32_t *)(d + i) = di;
|
||||||
|
@ -1290,8 +1286,8 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
|
||||||
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
|
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
|
||||||
uint64_t ai = *(uint64_t *)(a + i);
|
uint64_t ai = *(uint64_t *)(a + i);
|
||||||
uint64_t bi = *(uint64_t *)(b + i);
|
uint64_t bi = *(uint64_t *)(b + i);
|
||||||
uint64_t di = ai - bi;
|
uint64_t di;
|
||||||
if (ai < bi) {
|
if (usub64_overflow(ai, bi, &di)) {
|
||||||
di = 0;
|
di = 0;
|
||||||
}
|
}
|
||||||
*(uint64_t *)(d + i) = di;
|
*(uint64_t *)(d + i) = di;
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
/*
|
||||||
|
* Floating point arithmetic implementation
|
||||||
|
*
|
||||||
|
* The code in this source file is derived from release 2a of the SoftFloat
|
||||||
|
* IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
|
||||||
|
* some later contributions) are provided under that license, as detailed below.
|
||||||
|
* It has subsequently been modified by contributors to the QEMU Project,
|
||||||
|
* so some portions are provided under:
|
||||||
|
* the SoftFloat-2a license
|
||||||
|
* the BSD license
|
||||||
|
* GPL-v2-or-later
|
||||||
|
*
|
||||||
|
* Any future contributions to this file after December 1st 2014 will be
|
||||||
|
* taken to be licensed under the Softfloat-2a license unless specifically
|
||||||
|
* indicated otherwise.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void partsN(add_normal)(FloatPartsN *a, FloatPartsN *b)
|
||||||
|
{
|
||||||
|
int exp_diff = a->exp - b->exp;
|
||||||
|
|
||||||
|
if (exp_diff > 0) {
|
||||||
|
frac_shrjam(b, exp_diff);
|
||||||
|
} else if (exp_diff < 0) {
|
||||||
|
frac_shrjam(a, -exp_diff);
|
||||||
|
a->exp = b->exp;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (frac_add(a, a, b)) {
|
||||||
|
frac_shrjam(a, 1);
|
||||||
|
a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
|
||||||
|
a->exp += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool partsN(sub_normal)(FloatPartsN *a, FloatPartsN *b)
|
||||||
|
{
|
||||||
|
int exp_diff = a->exp - b->exp;
|
||||||
|
int shift;
|
||||||
|
|
||||||
|
if (exp_diff > 0) {
|
||||||
|
frac_shrjam(b, exp_diff);
|
||||||
|
frac_sub(a, a, b);
|
||||||
|
} else if (exp_diff < 0) {
|
||||||
|
a->exp = b->exp;
|
||||||
|
a->sign ^= 1;
|
||||||
|
frac_shrjam(a, -exp_diff);
|
||||||
|
frac_sub(a, b, a);
|
||||||
|
} else if (frac_sub(a, a, b)) {
|
||||||
|
/* Overflow means that A was less than B. */
|
||||||
|
frac_neg(a);
|
||||||
|
a->sign ^= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
shift = frac_normalize(a);
|
||||||
|
if (likely(shift < N)) {
|
||||||
|
a->exp -= shift;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
a->cls = float_class_zero;
|
||||||
|
return false;
|
||||||
|
}
|
|
@ -0,0 +1,817 @@
|
||||||
|
/*
|
||||||
|
* QEMU float support
|
||||||
|
*
|
||||||
|
* The code in this source file is derived from release 2a of the SoftFloat
|
||||||
|
* IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
|
||||||
|
* some later contributions) are provided under that license, as detailed below.
|
||||||
|
* It has subsequently been modified by contributors to the QEMU Project,
|
||||||
|
* so some portions are provided under:
|
||||||
|
* the SoftFloat-2a license
|
||||||
|
* the BSD license
|
||||||
|
* GPL-v2-or-later
|
||||||
|
*
|
||||||
|
* Any future contributions to this file after December 1st 2014 will be
|
||||||
|
* taken to be licensed under the Softfloat-2a license unless specifically
|
||||||
|
* indicated otherwise.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void partsN(return_nan)(FloatPartsN *a, float_status *s)
|
||||||
|
{
|
||||||
|
switch (a->cls) {
|
||||||
|
case float_class_snan:
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
if (s->default_nan_mode) {
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
} else {
|
||||||
|
parts_silence_nan(a, s);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case float_class_qnan:
|
||||||
|
if (s->default_nan_mode) {
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b,
|
||||||
|
float_status *s)
|
||||||
|
{
|
||||||
|
if (is_snan(a->cls) || is_snan(b->cls)) {
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s->default_nan_mode) {
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
} else {
|
||||||
|
int cmp = frac_cmp(a, b);
|
||||||
|
if (cmp == 0) {
|
||||||
|
cmp = a->sign < b->sign;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pickNaN(a->cls, b->cls, cmp > 0, s)) {
|
||||||
|
a = b;
|
||||||
|
}
|
||||||
|
if (is_snan(a->cls)) {
|
||||||
|
parts_silence_nan(a, s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b,
|
||||||
|
FloatPartsN *c, float_status *s,
|
||||||
|
int ab_mask, int abc_mask)
|
||||||
|
{
|
||||||
|
int which;
|
||||||
|
|
||||||
|
if (unlikely(abc_mask & float_cmask_snan)) {
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
which = pickNaNMulAdd(a->cls, b->cls, c->cls,
|
||||||
|
ab_mask == float_cmask_infzero, s);
|
||||||
|
|
||||||
|
if (s->default_nan_mode || which == 3) {
|
||||||
|
/*
|
||||||
|
* Note that this check is after pickNaNMulAdd so that function
|
||||||
|
* has an opportunity to set the Invalid flag for infzero.
|
||||||
|
*/
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (which) {
|
||||||
|
case 0:
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
a = b;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
a = c;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
if (is_snan(a->cls)) {
|
||||||
|
parts_silence_nan(a, s);
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Canonicalize the FloatParts structure. Determine the class,
|
||||||
|
* unbias the exponent, and normalize the fraction.
|
||||||
|
*/
|
||||||
|
static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
|
||||||
|
const FloatFmt *fmt)
|
||||||
|
{
|
||||||
|
if (unlikely(p->exp == 0)) {
|
||||||
|
if (likely(frac_eqz(p))) {
|
||||||
|
p->cls = float_class_zero;
|
||||||
|
} else if (status->flush_inputs_to_zero) {
|
||||||
|
float_raise(float_flag_input_denormal, status);
|
||||||
|
p->cls = float_class_zero;
|
||||||
|
frac_clear(p);
|
||||||
|
} else {
|
||||||
|
int shift = frac_normalize(p);
|
||||||
|
p->cls = float_class_normal;
|
||||||
|
p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1;
|
||||||
|
}
|
||||||
|
} else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
|
||||||
|
p->cls = float_class_normal;
|
||||||
|
p->exp -= fmt->exp_bias;
|
||||||
|
frac_shl(p, fmt->frac_shift);
|
||||||
|
p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
|
||||||
|
} else if (likely(frac_eqz(p))) {
|
||||||
|
p->cls = float_class_inf;
|
||||||
|
} else {
|
||||||
|
frac_shl(p, fmt->frac_shift);
|
||||||
|
p->cls = (parts_is_snan_frac(p->frac_hi, status)
|
||||||
|
? float_class_snan : float_class_qnan);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Round and uncanonicalize a floating-point number by parts. There
|
||||||
|
* are FRAC_SHIFT bits that may require rounding at the bottom of the
|
||||||
|
* fraction; these bits will be removed. The exponent will be biased
|
||||||
|
* by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
|
||||||
|
*/
|
||||||
|
static void partsN(uncanon)(FloatPartsN *p, float_status *s,
|
||||||
|
const FloatFmt *fmt)
|
||||||
|
{
|
||||||
|
const int exp_max = fmt->exp_max;
|
||||||
|
const int frac_shift = fmt->frac_shift;
|
||||||
|
const uint64_t frac_lsb = fmt->frac_lsb;
|
||||||
|
const uint64_t frac_lsbm1 = fmt->frac_lsbm1;
|
||||||
|
const uint64_t round_mask = fmt->round_mask;
|
||||||
|
const uint64_t roundeven_mask = fmt->roundeven_mask;
|
||||||
|
uint64_t inc;
|
||||||
|
bool overflow_norm;
|
||||||
|
int exp, flags = 0;
|
||||||
|
|
||||||
|
if (unlikely(p->cls != float_class_normal)) {
|
||||||
|
switch (p->cls) {
|
||||||
|
case float_class_zero:
|
||||||
|
p->exp = 0;
|
||||||
|
frac_clear(p);
|
||||||
|
return;
|
||||||
|
case float_class_inf:
|
||||||
|
g_assert(!fmt->arm_althp);
|
||||||
|
p->exp = fmt->exp_max;
|
||||||
|
frac_clear(p);
|
||||||
|
return;
|
||||||
|
case float_class_qnan:
|
||||||
|
case float_class_snan:
|
||||||
|
g_assert(!fmt->arm_althp);
|
||||||
|
p->exp = fmt->exp_max;
|
||||||
|
frac_shr(p, fmt->frac_shift);
|
||||||
|
return;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (s->float_rounding_mode) {
|
||||||
|
case float_round_nearest_even:
|
||||||
|
overflow_norm = false;
|
||||||
|
inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
|
||||||
|
break;
|
||||||
|
case float_round_ties_away:
|
||||||
|
overflow_norm = false;
|
||||||
|
inc = frac_lsbm1;
|
||||||
|
break;
|
||||||
|
case float_round_to_zero:
|
||||||
|
overflow_norm = true;
|
||||||
|
inc = 0;
|
||||||
|
break;
|
||||||
|
case float_round_up:
|
||||||
|
inc = p->sign ? 0 : round_mask;
|
||||||
|
overflow_norm = p->sign;
|
||||||
|
break;
|
||||||
|
case float_round_down:
|
||||||
|
inc = p->sign ? round_mask : 0;
|
||||||
|
overflow_norm = !p->sign;
|
||||||
|
break;
|
||||||
|
case float_round_to_odd:
|
||||||
|
overflow_norm = true;
|
||||||
|
inc = p->frac_lo & frac_lsb ? 0 : round_mask;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
|
||||||
|
exp = p->exp + fmt->exp_bias;
|
||||||
|
if (likely(exp > 0)) {
|
||||||
|
if (p->frac_lo & round_mask) {
|
||||||
|
flags |= float_flag_inexact;
|
||||||
|
if (frac_addi(p, p, inc)) {
|
||||||
|
frac_shr(p, 1);
|
||||||
|
p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
|
||||||
|
exp++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
frac_shr(p, frac_shift);
|
||||||
|
|
||||||
|
if (fmt->arm_althp) {
|
||||||
|
/* ARM Alt HP eschews Inf and NaN for a wider exponent. */
|
||||||
|
if (unlikely(exp > exp_max)) {
|
||||||
|
/* Overflow. Return the maximum normal. */
|
||||||
|
flags = float_flag_invalid;
|
||||||
|
exp = exp_max;
|
||||||
|
frac_allones(p);
|
||||||
|
}
|
||||||
|
} else if (unlikely(exp >= exp_max)) {
|
||||||
|
flags |= float_flag_overflow | float_flag_inexact;
|
||||||
|
if (overflow_norm) {
|
||||||
|
exp = exp_max - 1;
|
||||||
|
frac_allones(p);
|
||||||
|
} else {
|
||||||
|
p->cls = float_class_inf;
|
||||||
|
exp = exp_max;
|
||||||
|
frac_clear(p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (s->flush_to_zero) {
|
||||||
|
flags |= float_flag_output_denormal;
|
||||||
|
p->cls = float_class_zero;
|
||||||
|
exp = 0;
|
||||||
|
frac_clear(p);
|
||||||
|
} else {
|
||||||
|
bool is_tiny = s->tininess_before_rounding || exp < 0;
|
||||||
|
|
||||||
|
if (!is_tiny) {
|
||||||
|
FloatPartsN discard;
|
||||||
|
is_tiny = !frac_addi(&discard, p, inc);
|
||||||
|
}
|
||||||
|
|
||||||
|
frac_shrjam(p, 1 - exp);
|
||||||
|
|
||||||
|
if (p->frac_lo & round_mask) {
|
||||||
|
/* Need to recompute round-to-even/round-to-odd. */
|
||||||
|
switch (s->float_rounding_mode) {
|
||||||
|
case float_round_nearest_even:
|
||||||
|
inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
|
||||||
|
? frac_lsbm1 : 0);
|
||||||
|
break;
|
||||||
|
case float_round_to_odd:
|
||||||
|
inc = p->frac_lo & frac_lsb ? 0 : round_mask;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
flags |= float_flag_inexact;
|
||||||
|
frac_addi(p, p, inc);
|
||||||
|
}
|
||||||
|
|
||||||
|
exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) != 0;
|
||||||
|
frac_shr(p, frac_shift);
|
||||||
|
|
||||||
|
if (is_tiny && (flags & float_flag_inexact)) {
|
||||||
|
flags |= float_flag_underflow;
|
||||||
|
}
|
||||||
|
if (exp == 0 && frac_eqz(p)) {
|
||||||
|
p->cls = float_class_zero;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p->exp = exp;
|
||||||
|
float_raise(flags, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the result of adding or subtracting the values of the
|
||||||
|
* floating-point values `a' and `b'. The operation is performed
|
||||||
|
* according to the IEC/IEEE Standard for Binary Floating-Point
|
||||||
|
* Arithmetic.
|
||||||
|
*/
|
||||||
|
static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
|
||||||
|
float_status *s, bool subtract)
|
||||||
|
{
|
||||||
|
bool b_sign = b->sign ^ subtract;
|
||||||
|
int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
|
||||||
|
|
||||||
|
if (a->sign != b_sign) {
|
||||||
|
/* Subtraction */
|
||||||
|
if (likely(ab_mask == float_cmask_normal)) {
|
||||||
|
if (parts_sub_normal(a, b)) {
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
/* Subtract was exact, fall through to set sign. */
|
||||||
|
ab_mask = float_cmask_zero;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ab_mask == float_cmask_zero) {
|
||||||
|
a->sign = s->float_rounding_mode == float_round_down;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(ab_mask & float_cmask_anynan)) {
|
||||||
|
goto p_nan;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ab_mask & float_cmask_inf) {
|
||||||
|
if (a->cls != float_class_inf) {
|
||||||
|
/* N - Inf */
|
||||||
|
goto return_b;
|
||||||
|
}
|
||||||
|
if (b->cls != float_class_inf) {
|
||||||
|
/* Inf - N */
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
/* Inf - Inf */
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* Addition */
|
||||||
|
if (likely(ab_mask == float_cmask_normal)) {
|
||||||
|
parts_add_normal(a, b);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ab_mask == float_cmask_zero) {
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(ab_mask & float_cmask_anynan)) {
|
||||||
|
goto p_nan;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ab_mask & float_cmask_inf) {
|
||||||
|
a->cls = float_class_inf;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (b->cls == float_class_zero) {
|
||||||
|
g_assert(a->cls == float_class_normal);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
g_assert(a->cls == float_class_zero);
|
||||||
|
g_assert(b->cls == float_class_normal);
|
||||||
|
return_b:
|
||||||
|
b->sign = b_sign;
|
||||||
|
return b;
|
||||||
|
|
||||||
|
p_nan:
|
||||||
|
return parts_pick_nan(a, b, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the result of multiplying the floating-point values `a' and
|
||||||
|
* `b'. The operation is performed according to the IEC/IEEE Standard
|
||||||
|
* for Binary Floating-Point Arithmetic.
|
||||||
|
*/
|
||||||
|
static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
|
||||||
|
float_status *s)
|
||||||
|
{
|
||||||
|
int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
|
||||||
|
bool sign = a->sign ^ b->sign;
|
||||||
|
|
||||||
|
if (likely(ab_mask == float_cmask_normal)) {
|
||||||
|
FloatPartsW tmp;
|
||||||
|
|
||||||
|
frac_mulw(&tmp, a, b);
|
||||||
|
frac_truncjam(a, &tmp);
|
||||||
|
|
||||||
|
a->exp += b->exp + 1;
|
||||||
|
if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
|
||||||
|
frac_add(a, a, a);
|
||||||
|
a->exp -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
a->sign = sign;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Inf * Zero == NaN */
|
||||||
|
if (unlikely(ab_mask == float_cmask_infzero)) {
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(ab_mask & float_cmask_anynan)) {
|
||||||
|
return parts_pick_nan(a, b, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Multiply by 0 or Inf */
|
||||||
|
if (ab_mask & float_cmask_inf) {
|
||||||
|
a->cls = float_class_inf;
|
||||||
|
a->sign = sign;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
g_assert(ab_mask & float_cmask_zero);
|
||||||
|
a->cls = float_class_zero;
|
||||||
|
a->sign = sign;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the result of multiplying the floating-point values `a' and
|
||||||
|
* `b' then adding 'c', with no intermediate rounding step after the
|
||||||
|
* multiplication. The operation is performed according to the
|
||||||
|
* IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
|
||||||
|
* The flags argument allows the caller to select negation of the
|
||||||
|
* addend, the intermediate product, or the final result. (The
|
||||||
|
* difference between this and having the caller do a separate
|
||||||
|
* negation is that negating externally will flip the sign bit on NaNs.)
|
||||||
|
*
|
||||||
|
* Requires A and C extracted into a double-sized structure to provide the
|
||||||
|
* extra space for the widening multiply.
|
||||||
|
*/
|
||||||
|
static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
|
||||||
|
FloatPartsN *c, int flags, float_status *s)
|
||||||
|
{
|
||||||
|
int ab_mask, abc_mask;
|
||||||
|
FloatPartsW p_widen, c_widen;
|
||||||
|
|
||||||
|
ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
|
||||||
|
abc_mask = float_cmask(c->cls) | ab_mask;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* It is implementation-defined whether the cases of (0,inf,qnan)
|
||||||
|
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
|
||||||
|
* they return if they do), so we have to hand this information
|
||||||
|
* off to the target-specific pick-a-NaN routine.
|
||||||
|
*/
|
||||||
|
if (unlikely(abc_mask & float_cmask_anynan)) {
|
||||||
|
return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (flags & float_muladd_negate_c) {
|
||||||
|
c->sign ^= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute the sign of the product into A. */
|
||||||
|
a->sign ^= b->sign;
|
||||||
|
if (flags & float_muladd_negate_product) {
|
||||||
|
a->sign ^= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(ab_mask != float_cmask_normal)) {
|
||||||
|
if (unlikely(ab_mask == float_cmask_infzero)) {
|
||||||
|
goto d_nan;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ab_mask & float_cmask_inf) {
|
||||||
|
if (c->cls == float_class_inf && a->sign != c->sign) {
|
||||||
|
goto d_nan;
|
||||||
|
}
|
||||||
|
goto return_inf;
|
||||||
|
}
|
||||||
|
|
||||||
|
g_assert(ab_mask & float_cmask_zero);
|
||||||
|
if (c->cls == float_class_normal) {
|
||||||
|
*a = *c;
|
||||||
|
goto return_normal;
|
||||||
|
}
|
||||||
|
if (c->cls == float_class_zero) {
|
||||||
|
if (a->sign != c->sign) {
|
||||||
|
goto return_sub_zero;
|
||||||
|
}
|
||||||
|
goto return_zero;
|
||||||
|
}
|
||||||
|
g_assert(c->cls == float_class_inf);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(c->cls == float_class_inf)) {
|
||||||
|
a->sign = c->sign;
|
||||||
|
goto return_inf;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Perform the multiplication step. */
|
||||||
|
p_widen.sign = a->sign;
|
||||||
|
p_widen.exp = a->exp + b->exp + 1;
|
||||||
|
frac_mulw(&p_widen, a, b);
|
||||||
|
if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
|
||||||
|
frac_add(&p_widen, &p_widen, &p_widen);
|
||||||
|
p_widen.exp -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Perform the addition step. */
|
||||||
|
if (c->cls != float_class_zero) {
|
||||||
|
/* Zero-extend C to less significant bits. */
|
||||||
|
frac_widen(&c_widen, c);
|
||||||
|
c_widen.exp = c->exp;
|
||||||
|
|
||||||
|
if (a->sign == c->sign) {
|
||||||
|
parts_add_normal(&p_widen, &c_widen);
|
||||||
|
} else if (!parts_sub_normal(&p_widen, &c_widen)) {
|
||||||
|
goto return_sub_zero;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Narrow with sticky bit, for proper rounding later. */
|
||||||
|
frac_truncjam(a, &p_widen);
|
||||||
|
a->sign = p_widen.sign;
|
||||||
|
a->exp = p_widen.exp;
|
||||||
|
|
||||||
|
return_normal:
|
||||||
|
if (flags & float_muladd_halve_result) {
|
||||||
|
a->exp -= 1;
|
||||||
|
}
|
||||||
|
finish_sign:
|
||||||
|
if (flags & float_muladd_negate_result) {
|
||||||
|
a->sign ^= 1;
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
|
||||||
|
return_sub_zero:
|
||||||
|
a->sign = s->float_rounding_mode == float_round_down;
|
||||||
|
return_zero:
|
||||||
|
a->cls = float_class_zero;
|
||||||
|
goto finish_sign;
|
||||||
|
|
||||||
|
return_inf:
|
||||||
|
a->cls = float_class_inf;
|
||||||
|
goto finish_sign;
|
||||||
|
|
||||||
|
d_nan:
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the result of dividing the floating-point value `a' by the
|
||||||
|
* corresponding value `b'. The operation is performed according to
|
||||||
|
* the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
||||||
|
*/
|
||||||
|
static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
|
||||||
|
float_status *s)
|
||||||
|
{
|
||||||
|
int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
|
||||||
|
bool sign = a->sign ^ b->sign;
|
||||||
|
|
||||||
|
if (likely(ab_mask == float_cmask_normal)) {
|
||||||
|
a->sign = sign;
|
||||||
|
a->exp -= b->exp + frac_div(a, b);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 0/0 or Inf/Inf => NaN */
|
||||||
|
if (unlikely(ab_mask == float_cmask_zero) ||
|
||||||
|
unlikely(ab_mask == float_cmask_inf)) {
|
||||||
|
float_raise(float_flag_invalid, s);
|
||||||
|
parts_default_nan(a, s);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* All the NaN cases */
|
||||||
|
if (unlikely(ab_mask & float_cmask_anynan)) {
|
||||||
|
return parts_pick_nan(a, b, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
a->sign = sign;
|
||||||
|
|
||||||
|
/* Inf / X */
|
||||||
|
if (a->cls == float_class_inf) {
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 0 / X */
|
||||||
|
if (a->cls == float_class_zero) {
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* X / Inf */
|
||||||
|
if (b->cls == float_class_inf) {
|
||||||
|
a->cls = float_class_zero;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* X / 0 => Inf */
|
||||||
|
g_assert(b->cls == float_class_zero);
|
||||||
|
float_raise(float_flag_divbyzero, s);
|
||||||
|
a->cls = float_class_inf;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Rounds the floating-point value `a' to an integer, and returns the
|
||||||
|
* result as a floating-point value. The operation is performed
|
||||||
|
* according to the IEC/IEEE Standard for Binary Floating-Point
|
||||||
|
* Arithmetic.
|
||||||
|
*
|
||||||
|
* parts_round_to_int_normal is an internal helper function for
|
||||||
|
* normal numbers only, returning true for inexact but not directly
|
||||||
|
* raising float_flag_inexact.
|
||||||
|
*/
|
||||||
|
/*
 * Round the significand of @a (a normal number) to an integer value,
 * after first scaling by 2**@scale.  @frac_size is the fraction width
 * of the destination format.  Returns true if the result is inexact;
 * the caller is responsible for raising float_flag_inexact.
 */
static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode,
                                        int scale, int frac_size)
{
    uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc;
    int shift_adj;

    /* Clamp scale so that exponent arithmetic cannot overflow. */
    scale = MIN(MAX(scale, -0x10000), 0x10000);
    a->exp += scale;

    if (a->exp < 0) {
        bool one;

        /* All bits are fractional: the result is zero or one. */
        switch (rmode) {
        case float_round_nearest_even:
            one = false;
            if (a->exp == -1) {
                FloatPartsN tmp;
                /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */
                frac_add(&tmp, a, a);
                /* Anything remaining means frac > 0.5. */
                one = !frac_eqz(&tmp);
            }
            break;
        case float_round_ties_away:
            one = a->exp == -1;
            break;
        case float_round_to_zero:
            one = false;
            break;
        case float_round_up:
            one = !a->sign;
            break;
        case float_round_down:
            one = a->sign;
            break;
        case float_round_to_odd:
            one = true;
            break;
        default:
            g_assert_not_reached();
        }

        frac_clear(a);
        a->exp = 0;
        if (one) {
            a->frac_hi = DECOMPOSED_IMPLICIT_BIT;
        } else {
            a->cls = float_class_zero;
        }
        return true;
    }

    if (a->exp >= frac_size) {
        /* All integral -- nothing to round away, exact. */
        return false;
    }

    if (N > 64 && a->exp < N - 64) {
        /*
         * Rounding is not in the low word -- shift lsb to bit 2,
         * which leaves room for sticky and rounding bit.
         */
        shift_adj = (N - 1) - (a->exp + 2);
        frac_shrjam(a, shift_adj);
        frac_lsb = 1 << 2;
    } else {
        shift_adj = 0;
        frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63);
    }

    frac_lsbm1 = frac_lsb >> 1;
    rnd_mask = frac_lsb - 1;
    rnd_even_mask = rnd_mask | frac_lsb;

    if (!(a->frac_lo & rnd_mask)) {
        /* Fractional bits already clear, undo the shift above. */
        frac_shl(a, shift_adj);
        return false;
    }

    switch (rmode) {
    case float_round_nearest_even:
        inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
        break;
    case float_round_ties_away:
        inc = frac_lsbm1;
        break;
    case float_round_to_zero:
        inc = 0;
        break;
    case float_round_up:
        inc = a->sign ? 0 : rnd_mask;
        break;
    case float_round_down:
        inc = a->sign ? rnd_mask : 0;
        break;
    case float_round_to_odd:
        inc = a->frac_lo & frac_lsb ? 0 : rnd_mask;
        break;
    default:
        g_assert_not_reached();
    }

    if (shift_adj == 0) {
        if (frac_addi(a, a, inc)) {
            /* Carry out of the significand: renormalize. */
            frac_shr(a, 1);
            a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
            a->exp++;
        }
        a->frac_lo &= ~rnd_mask;
    } else {
        frac_addi(a, a, inc);
        a->frac_lo &= ~rnd_mask;
        /* Be careful shifting back, not to overflow */
        frac_shl(a, shift_adj - 1);
        if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) {
            a->exp++;
        } else {
            frac_add(a, a, a);
        }
    }
    return true;
}
|
||||||
|
|
||||||
|
static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
|
||||||
|
int scale, float_status *s,
|
||||||
|
const FloatFmt *fmt)
|
||||||
|
{
|
||||||
|
switch (a->cls) {
|
||||||
|
case float_class_qnan:
|
||||||
|
case float_class_snan:
|
||||||
|
parts_return_nan(a, s);
|
||||||
|
break;
|
||||||
|
case float_class_zero:
|
||||||
|
case float_class_inf:
|
||||||
|
break;
|
||||||
|
case float_class_normal:
|
||||||
|
if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
|
||||||
|
float_raise(float_flag_inexact, s);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the result of converting the floating-point value `a' to
|
||||||
|
* the two's complement integer format. The conversion is performed
|
||||||
|
* according to the IEC/IEEE Standard for Binary Floating-Point
|
||||||
|
* Arithmetic---which means in particular that the conversion is
|
||||||
|
* rounded according to the current rounding mode. If `a' is a NaN,
|
||||||
|
* the largest positive integer is returned. Otherwise, if the
|
||||||
|
* conversion overflows, the largest integer with the same sign as `a'
|
||||||
|
* is returned.
|
||||||
|
*/
|
||||||
|
static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
|
||||||
|
int scale, int64_t min, int64_t max,
|
||||||
|
float_status *s)
|
||||||
|
{
|
||||||
|
int flags = 0;
|
||||||
|
uint64_t r;
|
||||||
|
|
||||||
|
switch (p->cls) {
|
||||||
|
case float_class_snan:
|
||||||
|
case float_class_qnan:
|
||||||
|
flags = float_flag_invalid;
|
||||||
|
r = max;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case float_class_inf:
|
||||||
|
flags = float_flag_invalid;
|
||||||
|
r = p->sign ? min : max;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case float_class_zero:
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
case float_class_normal:
|
||||||
|
/* TODO: N - 2 is frac_size for rounding; could use input fmt. */
|
||||||
|
if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
|
||||||
|
flags = float_flag_inexact;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p->exp <= DECOMPOSED_BINARY_POINT) {
|
||||||
|
r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
|
||||||
|
} else {
|
||||||
|
r = UINT64_MAX;
|
||||||
|
}
|
||||||
|
if (p->sign) {
|
||||||
|
if (r <= -(uint64_t)min) {
|
||||||
|
r = -r;
|
||||||
|
} else {
|
||||||
|
flags = float_flag_invalid;
|
||||||
|
r = min;
|
||||||
|
}
|
||||||
|
} else if (r > max) {
|
||||||
|
flags = float_flag_invalid;
|
||||||
|
r = max;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
|
||||||
|
float_raise(flags, s);
|
||||||
|
return r;
|
||||||
|
}
|
|
@ -129,7 +129,7 @@ static bool parts_is_snan_frac(uint64_t frac, float_status *status)
|
||||||
| The pattern for a default generated deconstructed floating-point NaN.
|
| The pattern for a default generated deconstructed floating-point NaN.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static FloatParts parts_default_nan(float_status *status)
|
static void parts64_default_nan(FloatParts64 *p, float_status *status)
|
||||||
{
|
{
|
||||||
bool sign = 0;
|
bool sign = 0;
|
||||||
uint64_t frac;
|
uint64_t frac;
|
||||||
|
@ -163,7 +163,7 @@ static FloatParts parts_default_nan(float_status *status)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return (FloatParts) {
|
*p = (FloatParts64) {
|
||||||
.cls = float_class_qnan,
|
.cls = float_class_qnan,
|
||||||
.sign = sign,
|
.sign = sign,
|
||||||
.exp = INT_MAX,
|
.exp = INT_MAX,
|
||||||
|
@ -171,26 +171,55 @@ static FloatParts parts_default_nan(float_status *status)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fill in *p with the default generated quad-precision NaN. */
static void parts128_default_nan(FloatParts128 *p, float_status *status)
{
    /*
     * Extrapolate from the choices made by parts64_default_nan to fill
     * in the quad-floating format.  If the low bit is set, assume we
     * want to set all non-snan bits.
     */
    FloatParts64 p64;
    parts64_default_nan(&p64, status);

    *p = (FloatParts128) {
        .cls = float_class_qnan,
        .sign = p64.sign,
        .exp = INT_MAX,
        .frac_hi = p64.frac,
        .frac_lo = -(p64.frac & 1)
    };
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
| Returns a quiet NaN from a signalling NaN for the deconstructed
|
| Returns a quiet NaN from a signalling NaN for the deconstructed
|
||||||
| floating-point parts.
|
| floating-point parts.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static FloatParts parts_silence_nan(FloatParts a, float_status *status)
|
static uint64_t parts_silence_nan_frac(uint64_t frac, float_status *status)
|
||||||
{
|
{
|
||||||
g_assert(!no_signaling_nans(status));
|
g_assert(!no_signaling_nans(status));
|
||||||
#if defined(TARGET_HPPA)
|
g_assert(!status->default_nan_mode);
|
||||||
a.frac &= ~(1ULL << (DECOMPOSED_BINARY_POINT - 1));
|
|
||||||
a.frac |= 1ULL << (DECOMPOSED_BINARY_POINT - 2);
|
/* The only snan_bit_is_one target without default_nan_mode is HPPA. */
|
||||||
#else
|
|
||||||
if (snan_bit_is_one(status)) {
|
if (snan_bit_is_one(status)) {
|
||||||
return parts_default_nan(status);
|
frac &= ~(1ULL << (DECOMPOSED_BINARY_POINT - 1));
|
||||||
|
frac |= 1ULL << (DECOMPOSED_BINARY_POINT - 2);
|
||||||
} else {
|
} else {
|
||||||
a.frac |= 1ULL << (DECOMPOSED_BINARY_POINT - 1);
|
frac |= 1ULL << (DECOMPOSED_BINARY_POINT - 1);
|
||||||
}
|
}
|
||||||
#endif
|
return frac;
|
||||||
a.cls = float_class_qnan;
|
}
|
||||||
return a;
|
|
||||||
|
static void parts64_silence_nan(FloatParts64 *p, float_status *status)
|
||||||
|
{
|
||||||
|
p->frac = parts_silence_nan_frac(p->frac, status);
|
||||||
|
p->cls = float_class_qnan;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void parts128_silence_nan(FloatParts128 *p, float_status *status)
|
||||||
|
{
|
||||||
|
p->frac_hi = parts_silence_nan_frac(p->frac_hi, status);
|
||||||
|
p->cls = float_class_qnan;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -227,18 +256,6 @@ floatx80 floatx80_default_nan(float_status *status)
|
||||||
const floatx80 floatx80_infinity
|
const floatx80 floatx80_infinity
|
||||||
= make_floatx80_init(floatx80_infinity_high, floatx80_infinity_low);
|
= make_floatx80_init(floatx80_infinity_high, floatx80_infinity_low);
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
|
||||||
| Raises the exceptions specified by `flags'. Floating-point traps can be
|
|
||||||
| defined here if desired. It is currently not possible for such a trap
|
|
||||||
| to substitute a result value. If traps are not implemented, this routine
|
|
||||||
| should be simply `float_exception_flags |= flags;'.
|
|
||||||
*----------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
void float_raise(uint8_t flags, float_status *status)
|
|
||||||
{
|
|
||||||
status->float_exception_flags |= flags;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
| Internal canonical NaN format.
|
| Internal canonical NaN format.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
@ -1070,25 +1087,6 @@ bool float128_is_signaling_nan(float128 a, float_status *status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
|
||||||
| Returns a quiet NaN from a signalling NaN for the quadruple-precision
|
|
||||||
| floating point value `a'.
|
|
||||||
*----------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
float128 float128_silence_nan(float128 a, float_status *status)
|
|
||||||
{
|
|
||||||
if (no_signaling_nans(status)) {
|
|
||||||
g_assert_not_reached();
|
|
||||||
} else {
|
|
||||||
if (snan_bit_is_one(status)) {
|
|
||||||
return float128_default_nan(status);
|
|
||||||
} else {
|
|
||||||
a.high |= UINT64_C(0x0000800000000000);
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
| Returns the result of converting the quadruple-precision floating-point NaN
|
| Returns the result of converting the quadruple-precision floating-point NaN
|
||||||
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
|
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
|
||||||
|
|
3745
fpu/softfloat.c
3745
fpu/softfloat.c
File diff suppressed because it is too large
Load Diff
|
@ -83,6 +83,43 @@ this code that are retained.
|
||||||
#define FPU_SOFTFLOAT_MACROS_H
|
#define FPU_SOFTFLOAT_MACROS_H
|
||||||
|
|
||||||
#include "fpu/softfloat-types.h"
|
#include "fpu/softfloat-types.h"
|
||||||
|
#include "qemu/host-utils.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* shl_double: double-word merging left shift
|
||||||
|
* @l: left or most-significant word
|
||||||
|
* @r: right or least-significant word
|
||||||
|
* @c: shift count
|
||||||
|
*
|
||||||
|
* Shift @l left by @c bits, shifting in bits from @r.
|
||||||
|
*/
|
||||||
|
static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
|
||||||
|
{
|
||||||
|
#if defined(__x86_64__)
|
||||||
|
asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
|
||||||
|
return l;
|
||||||
|
#else
|
||||||
|
return c ? (l << c) | (r >> (64 - c)) : l;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* shr_double: double-word merging right shift
|
||||||
|
* @l: left or most-significant word
|
||||||
|
* @r: right or least-significant word
|
||||||
|
* @c: shift count
|
||||||
|
*
|
||||||
|
* Shift @r right by @c bits, shifting in bits from @l.
|
||||||
|
*/
|
||||||
|
static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
|
||||||
|
{
|
||||||
|
#if defined(__x86_64__)
|
||||||
|
asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
|
||||||
|
return r;
|
||||||
|
#else
|
||||||
|
return c ? (r >> c) | (l << (64 - c)) : r;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
| Shifts `a' right by the number of bits given in `count'. If any nonzero
|
| Shifts `a' right by the number of bits given in `count'. If any nonzero
|
||||||
|
@ -403,16 +440,12 @@ static inline void
|
||||||
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
|
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static inline void
|
static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
|
||||||
add128(
|
uint64_t *z0Ptr, uint64_t *z1Ptr)
|
||||||
uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
|
|
||||||
{
|
{
|
||||||
uint64_t z1;
|
bool c = 0;
|
||||||
|
*z1Ptr = uadd64_carry(a1, b1, &c);
|
||||||
z1 = a1 + b1;
|
*z0Ptr = uadd64_carry(a0, b0, &c);
|
||||||
*z1Ptr = z1;
|
|
||||||
*z0Ptr = a0 + b0 + ( z1 < a1 );
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -423,34 +456,14 @@ static inline void
|
||||||
| `z1Ptr', and `z2Ptr'.
|
| `z1Ptr', and `z2Ptr'.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static inline void
|
static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
|
||||||
add192(
|
uint64_t b0, uint64_t b1, uint64_t b2,
|
||||||
uint64_t a0,
|
uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
|
||||||
uint64_t a1,
|
|
||||||
uint64_t a2,
|
|
||||||
uint64_t b0,
|
|
||||||
uint64_t b1,
|
|
||||||
uint64_t b2,
|
|
||||||
uint64_t *z0Ptr,
|
|
||||||
uint64_t *z1Ptr,
|
|
||||||
uint64_t *z2Ptr
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
uint64_t z0, z1, z2;
|
bool c = 0;
|
||||||
int8_t carry0, carry1;
|
*z2Ptr = uadd64_carry(a2, b2, &c);
|
||||||
|
*z1Ptr = uadd64_carry(a1, b1, &c);
|
||||||
z2 = a2 + b2;
|
*z0Ptr = uadd64_carry(a0, b0, &c);
|
||||||
carry1 = ( z2 < a2 );
|
|
||||||
z1 = a1 + b1;
|
|
||||||
carry0 = ( z1 < a1 );
|
|
||||||
z0 = a0 + b0;
|
|
||||||
z1 += carry1;
|
|
||||||
z0 += ( z1 < carry1 );
|
|
||||||
z0 += carry0;
|
|
||||||
*z2Ptr = z2;
|
|
||||||
*z1Ptr = z1;
|
|
||||||
*z0Ptr = z0;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -461,14 +474,12 @@ static inline void
|
||||||
| `z1Ptr'.
|
| `z1Ptr'.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static inline void
|
static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
|
||||||
sub128(
|
uint64_t *z0Ptr, uint64_t *z1Ptr)
|
||||||
uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
|
|
||||||
{
|
{
|
||||||
|
bool c = 0;
|
||||||
*z1Ptr = a1 - b1;
|
*z1Ptr = usub64_borrow(a1, b1, &c);
|
||||||
*z0Ptr = a0 - b0 - ( a1 < b1 );
|
*z0Ptr = usub64_borrow(a0, b0, &c);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -479,34 +490,14 @@ static inline void
|
||||||
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
|
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static inline void
|
static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
|
||||||
sub192(
|
uint64_t b0, uint64_t b1, uint64_t b2,
|
||||||
uint64_t a0,
|
uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
|
||||||
uint64_t a1,
|
|
||||||
uint64_t a2,
|
|
||||||
uint64_t b0,
|
|
||||||
uint64_t b1,
|
|
||||||
uint64_t b2,
|
|
||||||
uint64_t *z0Ptr,
|
|
||||||
uint64_t *z1Ptr,
|
|
||||||
uint64_t *z2Ptr
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
uint64_t z0, z1, z2;
|
bool c = 0;
|
||||||
int8_t borrow0, borrow1;
|
*z2Ptr = usub64_borrow(a2, b2, &c);
|
||||||
|
*z1Ptr = usub64_borrow(a1, b1, &c);
|
||||||
z2 = a2 - b2;
|
*z0Ptr = usub64_borrow(a0, b0, &c);
|
||||||
borrow1 = ( a2 < b2 );
|
|
||||||
z1 = a1 - b1;
|
|
||||||
borrow0 = ( a1 < b1 );
|
|
||||||
z0 = a0 - b0;
|
|
||||||
z0 -= ( z1 < borrow1 );
|
|
||||||
z1 -= borrow1;
|
|
||||||
z0 -= borrow0;
|
|
||||||
*z2Ptr = z2;
|
|
||||||
*z1Ptr = z1;
|
|
||||||
*z0Ptr = z0;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -515,27 +506,10 @@ static inline void
|
||||||
| `z0Ptr' and `z1Ptr'.
|
| `z0Ptr' and `z1Ptr'.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
/*
 * 64x64 -> 128-bit multiply: the full product of @a and @b is stored
 * with the high half in *z0Ptr and the low half in *z1Ptr.
 */
static inline void
mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* mulu64 writes (low, high); this interface wants (high, low). */
    mulu64(z1Ptr, z0Ptr, a, b);
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -546,24 +520,14 @@ static inline void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
mul128By64To192(
|
mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
|
||||||
uint64_t a0,
|
uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
|
||||||
uint64_t a1,
|
|
||||||
uint64_t b,
|
|
||||||
uint64_t *z0Ptr,
|
|
||||||
uint64_t *z1Ptr,
|
|
||||||
uint64_t *z2Ptr
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
uint64_t z0, z1, z2, more1;
|
uint64_t z0, z1, m1;
|
||||||
|
|
||||||
mul64To128( a1, b, &z1, &z2 );
|
|
||||||
mul64To128( a0, b, &z0, &more1 );
|
|
||||||
add128( z0, more1, 0, z1, &z0, &z1 );
|
|
||||||
*z2Ptr = z2;
|
|
||||||
*z1Ptr = z1;
|
|
||||||
*z0Ptr = z0;
|
|
||||||
|
|
||||||
|
mul64To128(a1, b, &m1, z2Ptr);
|
||||||
|
mul64To128(a0, b, &z0, &z1);
|
||||||
|
add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
@ -573,34 +537,21 @@ static inline void
|
||||||
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
|
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
|
|
||||||
static inline void
|
static inline void mul128To256(uint64_t a0, uint64_t a1,
|
||||||
mul128To256(
|
uint64_t b0, uint64_t b1,
|
||||||
uint64_t a0,
|
uint64_t *z0Ptr, uint64_t *z1Ptr,
|
||||||
uint64_t a1,
|
uint64_t *z2Ptr, uint64_t *z3Ptr)
|
||||||
uint64_t b0,
|
|
||||||
uint64_t b1,
|
|
||||||
uint64_t *z0Ptr,
|
|
||||||
uint64_t *z1Ptr,
|
|
||||||
uint64_t *z2Ptr,
|
|
||||||
uint64_t *z3Ptr
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
uint64_t z0, z1, z2, z3;
|
uint64_t z0, z1, z2;
|
||||||
uint64_t more1, more2;
|
uint64_t m0, m1, m2, n1, n2;
|
||||||
|
|
||||||
mul64To128( a1, b1, &z2, &z3 );
|
mul64To128(a1, b0, &m1, &m2);
|
||||||
mul64To128( a1, b0, &z1, &more2 );
|
mul64To128(a0, b1, &n1, &n2);
|
||||||
add128( z1, more2, 0, z2, &z1, &z2 );
|
mul64To128(a1, b1, &z2, z3Ptr);
|
||||||
mul64To128( a0, b0, &z0, &more1 );
|
mul64To128(a0, b0, &z0, &z1);
|
||||||
add128( z0, more1, 0, z1, &z0, &z1 );
|
|
||||||
mul64To128( a0, b1, &more1, &more2 );
|
|
||||||
add128( more1, more2, 0, z2, &more1, &z2 );
|
|
||||||
add128( z0, z1, 0, more1, &z0, &z1 );
|
|
||||||
*z3Ptr = z3;
|
|
||||||
*z2Ptr = z2;
|
|
||||||
*z1Ptr = z1;
|
|
||||||
*z0Ptr = z0;
|
|
||||||
|
|
||||||
|
add192( 0, m1, m2, 0, n1, n2, &m0, &m1, &m2);
|
||||||
|
add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
|
|
|
@ -100,7 +100,10 @@ typedef enum {
|
||||||
| Routine to raise any or all of the software IEC/IEEE floating-point
|
| Routine to raise any or all of the software IEC/IEEE floating-point
|
||||||
| exception flags.
|
| exception flags.
|
||||||
*----------------------------------------------------------------------------*/
|
*----------------------------------------------------------------------------*/
|
||||||
/*----------------------------------------------------------------------------
| Routine to raise any or all of the software IEC/IEEE floating-point
| exception flags.
*----------------------------------------------------------------------------*/
static inline void float_raise(uint8_t flags, float_status *status)
{
    status->float_exception_flags |= flags;
}
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------
|
/*----------------------------------------------------------------------------
|
||||||
| If `a' is denormal and we are in flush-to-zero mode then set the
|
| If `a' is denormal and we are in flush-to-zero mode then set the
|
||||||
|
@ -1194,6 +1197,8 @@ float128 float128_round_to_int(float128, float_status *status);
|
||||||
float128 float128_add(float128, float128, float_status *status);
|
float128 float128_add(float128, float128, float_status *status);
|
||||||
float128 float128_sub(float128, float128, float_status *status);
|
float128 float128_sub(float128, float128, float_status *status);
|
||||||
float128 float128_mul(float128, float128, float_status *status);
|
float128 float128_mul(float128, float128, float_status *status);
|
||||||
|
float128 float128_muladd(float128, float128, float128, int,
|
||||||
|
float_status *status);
|
||||||
float128 float128_div(float128, float128, float_status *status);
|
float128 float128_div(float128, float128, float_status *status);
|
||||||
float128 float128_rem(float128, float128, float_status *status);
|
float128 float128_rem(float128, float128, float_status *status);
|
||||||
float128 float128_sqrt(float128, float_status *status);
|
float128 float128_sqrt(float128, float_status *status);
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
#ifndef HOST_UTILS_H
|
#ifndef HOST_UTILS_H
|
||||||
#define HOST_UTILS_H
|
#define HOST_UTILS_H
|
||||||
|
|
||||||
|
#include "qemu/compiler.h"
|
||||||
#include "qemu/bswap.h"
|
#include "qemu/bswap.h"
|
||||||
|
|
||||||
#ifdef CONFIG_INT128
|
#ifdef CONFIG_INT128
|
||||||
|
@ -272,6 +273,9 @@ static inline int ctpop64(uint64_t val)
|
||||||
*/
|
*/
|
||||||
static inline uint8_t revbit8(uint8_t x)
|
static inline uint8_t revbit8(uint8_t x)
|
||||||
{
|
{
|
||||||
|
#if __has_builtin(__builtin_bitreverse8)
|
||||||
|
return __builtin_bitreverse8(x);
|
||||||
|
#else
|
||||||
/* Assign the correct nibble position. */
|
/* Assign the correct nibble position. */
|
||||||
x = ((x & 0xf0) >> 4)
|
x = ((x & 0xf0) >> 4)
|
||||||
| ((x & 0x0f) << 4);
|
| ((x & 0x0f) << 4);
|
||||||
|
@ -281,6 +285,7 @@ static inline uint8_t revbit8(uint8_t x)
|
||||||
| ((x & 0x22) << 1)
|
| ((x & 0x22) << 1)
|
||||||
| ((x & 0x11) << 3);
|
| ((x & 0x11) << 3);
|
||||||
return x;
|
return x;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -289,6 +294,9 @@ static inline uint8_t revbit8(uint8_t x)
|
||||||
*/
|
*/
|
||||||
static inline uint16_t revbit16(uint16_t x)
|
static inline uint16_t revbit16(uint16_t x)
|
||||||
{
|
{
|
||||||
|
#if __has_builtin(__builtin_bitreverse16)
|
||||||
|
return __builtin_bitreverse16(x);
|
||||||
|
#else
|
||||||
/* Assign the correct byte position. */
|
/* Assign the correct byte position. */
|
||||||
x = bswap16(x);
|
x = bswap16(x);
|
||||||
/* Assign the correct nibble position. */
|
/* Assign the correct nibble position. */
|
||||||
|
@ -300,6 +308,7 @@ static inline uint16_t revbit16(uint16_t x)
|
||||||
| ((x & 0x2222) << 1)
|
| ((x & 0x2222) << 1)
|
||||||
| ((x & 0x1111) << 3);
|
| ((x & 0x1111) << 3);
|
||||||
return x;
|
return x;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -308,6 +317,9 @@ static inline uint16_t revbit16(uint16_t x)
|
||||||
*/
|
*/
|
||||||
static inline uint32_t revbit32(uint32_t x)
|
static inline uint32_t revbit32(uint32_t x)
|
||||||
{
|
{
|
||||||
|
#if __has_builtin(__builtin_bitreverse32)
|
||||||
|
return __builtin_bitreverse32(x);
|
||||||
|
#else
|
||||||
/* Assign the correct byte position. */
|
/* Assign the correct byte position. */
|
||||||
x = bswap32(x);
|
x = bswap32(x);
|
||||||
/* Assign the correct nibble position. */
|
/* Assign the correct nibble position. */
|
||||||
|
@ -319,6 +331,7 @@ static inline uint32_t revbit32(uint32_t x)
|
||||||
| ((x & 0x22222222u) << 1)
|
| ((x & 0x22222222u) << 1)
|
||||||
| ((x & 0x11111111u) << 3);
|
| ((x & 0x11111111u) << 3);
|
||||||
return x;
|
return x;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -327,6 +340,9 @@ static inline uint32_t revbit32(uint32_t x)
|
||||||
*/
|
*/
|
||||||
static inline uint64_t revbit64(uint64_t x)
|
static inline uint64_t revbit64(uint64_t x)
|
||||||
{
|
{
|
||||||
|
#if __has_builtin(__builtin_bitreverse64)
|
||||||
|
return __builtin_bitreverse64(x);
|
||||||
|
#else
|
||||||
/* Assign the correct byte position. */
|
/* Assign the correct byte position. */
|
||||||
x = bswap64(x);
|
x = bswap64(x);
|
||||||
/* Assign the correct nibble position. */
|
/* Assign the correct nibble position. */
|
||||||
|
@ -338,6 +354,281 @@ static inline uint64_t revbit64(uint64_t x)
|
||||||
| ((x & 0x2222222222222222ull) << 1)
|
| ((x & 0x2222222222222222ull) << 1)
|
||||||
| ((x & 0x1111111111111111ull) << 3);
|
| ((x & 0x1111111111111111ull) << 3);
|
||||||
return x;
|
return x;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* sadd32_overflow - addition with overflow indication
|
||||||
|
* @x, @y: addends
|
||||||
|
* @ret: Output for sum
|
||||||
|
*
|
||||||
|
* Computes *@ret = @x + @y, and returns true if and only if that
|
||||||
|
* value has been truncated.
|
||||||
|
*/
|
||||||
|
static inline bool sadd32_overflow(int32_t x, int32_t y, int32_t *ret)
|
||||||
|
{
|
||||||
|
#if __has_builtin(__builtin_add_overflow) || __GNUC__ >= 5
|
||||||
|
return __builtin_add_overflow(x, y, ret);
|
||||||
|
#else
|
||||||
|
*ret = x + y;
|
||||||
|
return ((*ret ^ x) & ~(x ^ y)) < 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* sadd64_overflow - addition with overflow indication
|
||||||
|
* @x, @y: addends
|
||||||
|
* @ret: Output for sum
|
||||||
|
*
|
||||||
|
* Computes *@ret = @x + @y, and returns true if and only if that
|
||||||
|
* value has been truncated.
|
||||||
|
*/
|
||||||
|
static inline bool sadd64_overflow(int64_t x, int64_t y, int64_t *ret)
|
||||||
|
{
|
||||||
|
#if __has_builtin(__builtin_add_overflow) || __GNUC__ >= 5
|
||||||
|
return __builtin_add_overflow(x, y, ret);
|
||||||
|
#else
|
||||||
|
*ret = x + y;
|
||||||
|
return ((*ret ^ x) & ~(x ^ y)) < 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* uadd32_overflow - addition with overflow indication
|
||||||
|
* @x, @y: addends
|
||||||
|
* @ret: Output for sum
|
||||||
|
*
|
||||||
|
* Computes *@ret = @x + @y, and returns true if and only if that
|
||||||
|
* value has been truncated.
|
||||||
|
*/
|
||||||
|
static inline bool uadd32_overflow(uint32_t x, uint32_t y, uint32_t *ret)
|
||||||
|
{
|
||||||
|
#if __has_builtin(__builtin_add_overflow) || __GNUC__ >= 5
|
||||||
|
return __builtin_add_overflow(x, y, ret);
|
||||||
|
#else
|
||||||
|
*ret = x + y;
|
||||||
|
return *ret < x;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * uadd64_overflow - addition with overflow indication
 * @x, @y: addends
 * @ret: Output for sum
 *
 * Stores @x + @y into *@ret; returns true if and only if the sum
 * wrapped around (i.e. was truncated to 64 bits).
 */
static inline bool uadd64_overflow(uint64_t x, uint64_t y, uint64_t *ret)
{
#if __has_builtin(__builtin_add_overflow) || __GNUC__ >= 5
    return __builtin_add_overflow(x, y, ret);
#else
    uint64_t sum = x + y;

    *ret = sum;
    /* An unsigned addition wrapped iff the result is below an addend. */
    return sum < x;
#endif
}
|
||||||
|
|
||||||
|
/**
 * ssub32_overflow - subtraction with overflow indication
 * @x: Minuend
 * @y: Subtrahend
 * @ret: Output for difference
 *
 * Computes *@ret = @x - @y, and returns true if and only if that
 * value has been truncated.
 */
static inline bool ssub32_overflow(int32_t x, int32_t y, int32_t *ret)
{
#if __has_builtin(__builtin_sub_overflow) || __GNUC__ >= 5
    return __builtin_sub_overflow(x, y, ret);
#else
    /*
     * Subtract in unsigned arithmetic: signed overflow is undefined
     * behavior in C (C11 6.5p5), while unsigned wraps modulo 2^32.
     */
    uint32_t ux = (uint32_t)x;
    uint32_t uy = (uint32_t)y;
    uint32_t ud = ux - uy;

    /* Implementation-defined (two's complement) narrowing conversion. */
    *ret = (int32_t)ud;
    /*
     * Overflow occurred iff the operands have differing signs and the
     * result's sign differs from the minuend's.
     */
    return ((ud ^ ux) & (ux ^ uy) & 0x80000000u) != 0;
#endif
}
|
||||||
|
|
||||||
|
/**
 * ssub64_overflow - subtraction with overflow indication
 * @x: Minuend
 * @y: Subtrahend
 * @ret: Output for difference
 *
 * Computes *@ret = @x - @y, and returns true if and only if that
 * value has been truncated.
 */
static inline bool ssub64_overflow(int64_t x, int64_t y, int64_t *ret)
{
#if __has_builtin(__builtin_sub_overflow) || __GNUC__ >= 5
    return __builtin_sub_overflow(x, y, ret);
#else
    /*
     * Subtract in unsigned arithmetic to avoid the undefined behavior
     * of signed overflow (C11 6.5p5); unsigned wraps modulo 2^64.
     */
    uint64_t ux = (uint64_t)x;
    uint64_t uy = (uint64_t)y;
    uint64_t ud = ux - uy;

    /* Implementation-defined (two's complement) narrowing conversion. */
    *ret = (int64_t)ud;
    /* Overflow iff operand signs differ and the result disagrees with @x. */
    return ((ud ^ ux) & (ux ^ uy) & (UINT64_C(1) << 63)) != 0;
#endif
}
|
||||||
|
|
||||||
|
/**
 * usub32_overflow - subtraction with overflow indication
 * @x: Minuend
 * @y: Subtrahend
 * @ret: Output for difference
 *
 * Stores @x - @y into *@ret; returns true if and only if the
 * difference wrapped around (i.e. was truncated).
 */
static inline bool usub32_overflow(uint32_t x, uint32_t y, uint32_t *ret)
{
#if __has_builtin(__builtin_sub_overflow) || __GNUC__ >= 5
    return __builtin_sub_overflow(x, y, ret);
#else
    *ret = x - y;
    /* Unsigned subtraction borrows exactly when the subtrahend is larger. */
    return x < y;
#endif
}
|
||||||
|
|
||||||
|
/**
 * usub64_overflow - subtraction with overflow indication
 * @x: Minuend
 * @y: Subtrahend
 * @ret: Output for difference
 *
 * Stores @x - @y into *@ret; returns true if and only if the
 * difference wrapped around (i.e. was truncated).
 */
static inline bool usub64_overflow(uint64_t x, uint64_t y, uint64_t *ret)
{
#if __has_builtin(__builtin_sub_overflow) || __GNUC__ >= 5
    return __builtin_sub_overflow(x, y, ret);
#else
    *ret = x - y;
    /* Unsigned subtraction borrows exactly when the subtrahend is larger. */
    return x < y;
#endif
}
|
||||||
|
|
||||||
|
/**
 * smul32_overflow - multiplication with overflow indication
 * @x, @y: Input multipliers
 * @ret: Output for product
 *
 * Stores @x * @y into *@ret; returns true if and only if the
 * product did not fit in 32 bits.
 */
static inline bool smul32_overflow(int32_t x, int32_t y, int32_t *ret)
{
#if __has_builtin(__builtin_mul_overflow) || __GNUC__ >= 5
    return __builtin_mul_overflow(x, y, ret);
#else
    int64_t full = (int64_t)x * y;   /* exact product in 64 bits */

    *ret = (int32_t)full;
    /* Truncated iff the narrowed value no longer equals the product. */
    return *ret != full;
#endif
}
|
||||||
|
|
||||||
|
/**
 * smul64_overflow - multiplication with overflow indication
 * @x, @y: Input multipliers
 * @ret: Output for product
 *
 * Stores @x * @y into *@ret; returns true if and only if the
 * product did not fit in 64 bits.
 */
static inline bool smul64_overflow(int64_t x, int64_t y, int64_t *ret)
{
#if __has_builtin(__builtin_mul_overflow) || __GNUC__ >= 5
    return __builtin_mul_overflow(x, y, ret);
#else
    uint64_t lo, hi;

    muls64(&lo, &hi, x, y);
    *ret = lo;
    /* In range iff the high half is the sign extension of the low half. */
    return hi != ((int64_t)lo >> 63);
#endif
}
|
||||||
|
|
||||||
|
/**
 * umul32_overflow - multiplication with overflow indication
 * @x, @y: Input multipliers
 * @ret: Output for product
 *
 * Stores @x * @y into *@ret; returns true if and only if the
 * product did not fit in 32 bits.
 */
static inline bool umul32_overflow(uint32_t x, uint32_t y, uint32_t *ret)
{
#if __has_builtin(__builtin_mul_overflow) || __GNUC__ >= 5
    return __builtin_mul_overflow(x, y, ret);
#else
    uint64_t wide = (uint64_t)x * y;   /* exact product in 64 bits */

    *ret = (uint32_t)wide;
    return wide > UINT32_MAX;
#endif
}
|
||||||
|
|
||||||
|
/**
 * umul64_overflow - multiplication with overflow indication
 * @x, @y: Input multipliers
 * @ret: Output for product
 *
 * Stores @x * @y into *@ret; returns true if and only if the
 * product did not fit in 64 bits.
 */
static inline bool umul64_overflow(uint64_t x, uint64_t y, uint64_t *ret)
{
#if __has_builtin(__builtin_mul_overflow) || __GNUC__ >= 5
    return __builtin_mul_overflow(x, y, ret);
#else
    uint64_t hi;

    mulu64(ret, &hi, x, y);
    /* Any bits in the high half mean the product overflowed 64 bits. */
    return hi != 0;
#endif
}
|
||||||
|
|
||||||
|
/**
 * uadd64_carry - addition with carry-in and carry-out
 * @x, @y: addends
 * @pcarry: in-out carry value
 *
 * Computes @x + @y + *@pcarry, placing the carry-out back
 * into *@pcarry and returning the 64-bit sum.
 */
static inline uint64_t uadd64_carry(uint64_t x, uint64_t y, bool *pcarry)
{
#if __has_builtin(__builtin_addcll)
    unsigned long long carry = *pcarry;

    x = __builtin_addcll(x, y, carry, &carry);
    /* Only the low bit of the carry-out is meaningful. */
    *pcarry = carry & 1;
    return x;
#else
    bool carry = *pcarry;

    /*
     * This is clang's internal expansion of __builtin_addc:
     * fold in the carry first, then the second addend; either
     * step (but not both) may produce a carry-out.
     */
    carry = uadd64_overflow(x, carry, &x);
    carry |= uadd64_overflow(x, y, &x);
    *pcarry = carry;
    return x;
#endif
}
|
||||||
|
|
||||||
|
/**
 * usub64_borrow - subtraction with borrow-in and borrow-out
 * @x: Minuend
 * @y: Subtrahend
 * @pborrow: in-out borrow value
 *
 * Computes @x - @y - *@pborrow, placing the borrow-out back
 * into *@pborrow and returning the 64-bit difference.
 */
static inline uint64_t usub64_borrow(uint64_t x, uint64_t y, bool *pborrow)
{
#if __has_builtin(__builtin_subcll)
    unsigned long long borrow = *pborrow;

    x = __builtin_subcll(x, y, borrow, &borrow);
    /* Only the low bit of the borrow-out is meaningful. */
    *pborrow = borrow & 1;
    return x;
#else
    bool borrow = *pborrow;

    /*
     * Mirror of the carry chain in uadd64_carry: subtract the
     * borrow first, then the subtrahend; either step (but not
     * both) may produce a borrow-out.
     */
    borrow = usub64_overflow(x, borrow, &x);
    borrow |= usub64_overflow(x, y, &x);
    *pborrow = borrow;
    return x;
#endif
}
|
}
|
||||||
|
|
||||||
/* Host type specific sizes of these routines. */
|
/* Host type specific sizes of these routines. */
|
||||||
|
|
|
@ -27,8 +27,14 @@ static inline void restore_flush_mode(CPUMIPSState *env)
|
||||||
|
|
||||||
static inline void restore_snan_bit_mode(CPUMIPSState *env)
|
static inline void restore_snan_bit_mode(CPUMIPSState *env)
|
||||||
{
|
{
|
||||||
set_snan_bit_is_one((env->active_fpu.fcr31 & (1 << FCR31_NAN2008)) == 0,
|
bool nan2008 = env->active_fpu.fcr31 & (1 << FCR31_NAN2008);
|
||||||
&env->active_fpu.fp_status);
|
|
||||||
|
/*
|
||||||
|
* With nan2008, SNaNs are silenced in the usual way.
|
||||||
|
* Before that, SNaNs are not silenced; default nans are produced.
|
||||||
|
*/
|
||||||
|
set_snan_bit_is_one(!nan2008, &env->active_fpu.fp_status);
|
||||||
|
set_default_nan_mode(!nan2008, &env->active_fpu.fp_status);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void restore_fp_status(CPUMIPSState *env)
|
static inline void restore_fp_status(CPUMIPSState *env)
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <fenv.h>
|
#include <fenv.h>
|
||||||
#include "qemu/timer.h"
|
#include "qemu/timer.h"
|
||||||
|
#include "qemu/int128.h"
|
||||||
#include "fpu/softfloat.h"
|
#include "fpu/softfloat.h"
|
||||||
|
|
||||||
/* amortize the computation of random inputs */
|
/* amortize the computation of random inputs */
|
||||||
|
@ -50,8 +51,10 @@ static const char * const op_names[] = {
|
||||||
enum precision {
|
enum precision {
|
||||||
PREC_SINGLE,
|
PREC_SINGLE,
|
||||||
PREC_DOUBLE,
|
PREC_DOUBLE,
|
||||||
|
PREC_QUAD,
|
||||||
PREC_FLOAT32,
|
PREC_FLOAT32,
|
||||||
PREC_FLOAT64,
|
PREC_FLOAT64,
|
||||||
|
PREC_FLOAT128,
|
||||||
PREC_MAX_NR,
|
PREC_MAX_NR,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -89,6 +92,7 @@ union fp {
|
||||||
double d;
|
double d;
|
||||||
float32 f32;
|
float32 f32;
|
||||||
float64 f64;
|
float64 f64;
|
||||||
|
float128 f128;
|
||||||
uint64_t u64;
|
uint64_t u64;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -113,6 +117,10 @@ struct op_desc {
|
||||||
static uint64_t random_ops[MAX_OPERANDS] = {
|
static uint64_t random_ops[MAX_OPERANDS] = {
|
||||||
SEED_A, SEED_B, SEED_C,
|
SEED_A, SEED_B, SEED_C,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static float128 random_quad_ops[MAX_OPERANDS] = {
|
||||||
|
{SEED_A, SEED_B}, {SEED_B, SEED_C}, {SEED_C, SEED_A},
|
||||||
|
};
|
||||||
static float_status soft_status;
|
static float_status soft_status;
|
||||||
static enum precision precision;
|
static enum precision precision;
|
||||||
static enum op operation;
|
static enum op operation;
|
||||||
|
@ -141,25 +149,45 @@ static void update_random_ops(int n_ops, enum precision prec)
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < n_ops; i++) {
|
for (i = 0; i < n_ops; i++) {
|
||||||
uint64_t r = random_ops[i];
|
|
||||||
|
|
||||||
switch (prec) {
|
switch (prec) {
|
||||||
case PREC_SINGLE:
|
case PREC_SINGLE:
|
||||||
case PREC_FLOAT32:
|
case PREC_FLOAT32:
|
||||||
|
{
|
||||||
|
uint64_t r = random_ops[i];
|
||||||
do {
|
do {
|
||||||
r = xorshift64star(r);
|
r = xorshift64star(r);
|
||||||
} while (!float32_is_normal(r));
|
} while (!float32_is_normal(r));
|
||||||
|
random_ops[i] = r;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case PREC_DOUBLE:
|
case PREC_DOUBLE:
|
||||||
case PREC_FLOAT64:
|
case PREC_FLOAT64:
|
||||||
|
{
|
||||||
|
uint64_t r = random_ops[i];
|
||||||
do {
|
do {
|
||||||
r = xorshift64star(r);
|
r = xorshift64star(r);
|
||||||
} while (!float64_is_normal(r));
|
} while (!float64_is_normal(r));
|
||||||
|
random_ops[i] = r;
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
case PREC_QUAD:
|
||||||
|
case PREC_FLOAT128:
|
||||||
|
{
|
||||||
|
float128 r = random_quad_ops[i];
|
||||||
|
uint64_t hi = r.high;
|
||||||
|
uint64_t lo = r.low;
|
||||||
|
do {
|
||||||
|
hi = xorshift64star(hi);
|
||||||
|
lo = xorshift64star(lo);
|
||||||
|
r = make_float128(hi, lo);
|
||||||
|
} while (!float128_is_normal(r));
|
||||||
|
random_quad_ops[i] = r;
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
g_assert_not_reached();
|
g_assert_not_reached();
|
||||||
}
|
}
|
||||||
random_ops[i] = r;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -184,6 +212,13 @@ static void fill_random(union fp *ops, int n_ops, enum precision prec,
|
||||||
ops[i].f64 = float64_chs(ops[i].f64);
|
ops[i].f64 = float64_chs(ops[i].f64);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case PREC_QUAD:
|
||||||
|
case PREC_FLOAT128:
|
||||||
|
ops[i].f128 = random_quad_ops[i];
|
||||||
|
if (no_neg && float128_is_neg(ops[i].f128)) {
|
||||||
|
ops[i].f128 = float128_chs(ops[i].f128);
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
g_assert_not_reached();
|
g_assert_not_reached();
|
||||||
}
|
}
|
||||||
|
@ -345,6 +380,41 @@ static void bench(enum precision prec, enum op op, int n_ops, bool no_neg)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case PREC_FLOAT128:
|
||||||
|
fill_random(ops, n_ops, prec, no_neg);
|
||||||
|
t0 = get_clock();
|
||||||
|
for (i = 0; i < OPS_PER_ITER; i++) {
|
||||||
|
float128 a = ops[0].f128;
|
||||||
|
float128 b = ops[1].f128;
|
||||||
|
float128 c = ops[2].f128;
|
||||||
|
|
||||||
|
switch (op) {
|
||||||
|
case OP_ADD:
|
||||||
|
res.f128 = float128_add(a, b, &soft_status);
|
||||||
|
break;
|
||||||
|
case OP_SUB:
|
||||||
|
res.f128 = float128_sub(a, b, &soft_status);
|
||||||
|
break;
|
||||||
|
case OP_MUL:
|
||||||
|
res.f128 = float128_mul(a, b, &soft_status);
|
||||||
|
break;
|
||||||
|
case OP_DIV:
|
||||||
|
res.f128 = float128_div(a, b, &soft_status);
|
||||||
|
break;
|
||||||
|
case OP_FMA:
|
||||||
|
res.f128 = float128_muladd(a, b, c, 0, &soft_status);
|
||||||
|
break;
|
||||||
|
case OP_SQRT:
|
||||||
|
res.f128 = float128_sqrt(a, &soft_status);
|
||||||
|
break;
|
||||||
|
case OP_CMP:
|
||||||
|
res.u64 = float128_compare_quiet(a, b, &soft_status);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
g_assert_not_reached();
|
g_assert_not_reached();
|
||||||
}
|
}
|
||||||
|
@ -369,7 +439,8 @@ static void bench(enum precision prec, enum op op, int n_ops, bool no_neg)
|
||||||
GEN_BENCH(bench_ ## opname ## _float, float, PREC_SINGLE, op, n_ops) \
|
GEN_BENCH(bench_ ## opname ## _float, float, PREC_SINGLE, op, n_ops) \
|
||||||
GEN_BENCH(bench_ ## opname ## _double, double, PREC_DOUBLE, op, n_ops) \
|
GEN_BENCH(bench_ ## opname ## _double, double, PREC_DOUBLE, op, n_ops) \
|
||||||
GEN_BENCH(bench_ ## opname ## _float32, float32, PREC_FLOAT32, op, n_ops) \
|
GEN_BENCH(bench_ ## opname ## _float32, float32, PREC_FLOAT32, op, n_ops) \
|
||||||
GEN_BENCH(bench_ ## opname ## _float64, float64, PREC_FLOAT64, op, n_ops)
|
GEN_BENCH(bench_ ## opname ## _float64, float64, PREC_FLOAT64, op, n_ops) \
|
||||||
|
GEN_BENCH(bench_ ## opname ## _float128, float128, PREC_FLOAT128, op, n_ops)
|
||||||
|
|
||||||
GEN_BENCH_ALL_TYPES(add, OP_ADD, 2)
|
GEN_BENCH_ALL_TYPES(add, OP_ADD, 2)
|
||||||
GEN_BENCH_ALL_TYPES(sub, OP_SUB, 2)
|
GEN_BENCH_ALL_TYPES(sub, OP_SUB, 2)
|
||||||
|
@ -383,7 +454,8 @@ GEN_BENCH_ALL_TYPES(cmp, OP_CMP, 2)
|
||||||
GEN_BENCH_NO_NEG(bench_ ## name ## _float, float, PREC_SINGLE, op, n) \
|
GEN_BENCH_NO_NEG(bench_ ## name ## _float, float, PREC_SINGLE, op, n) \
|
||||||
GEN_BENCH_NO_NEG(bench_ ## name ## _double, double, PREC_DOUBLE, op, n) \
|
GEN_BENCH_NO_NEG(bench_ ## name ## _double, double, PREC_DOUBLE, op, n) \
|
||||||
GEN_BENCH_NO_NEG(bench_ ## name ## _float32, float32, PREC_FLOAT32, op, n) \
|
GEN_BENCH_NO_NEG(bench_ ## name ## _float32, float32, PREC_FLOAT32, op, n) \
|
||||||
GEN_BENCH_NO_NEG(bench_ ## name ## _float64, float64, PREC_FLOAT64, op, n)
|
GEN_BENCH_NO_NEG(bench_ ## name ## _float64, float64, PREC_FLOAT64, op, n) \
|
||||||
|
GEN_BENCH_NO_NEG(bench_ ## name ## _float128, float128, PREC_FLOAT128, op, n)
|
||||||
|
|
||||||
GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1)
|
GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1)
|
||||||
#undef GEN_BENCH_ALL_TYPES_NO_NEG
|
#undef GEN_BENCH_ALL_TYPES_NO_NEG
|
||||||
|
@ -397,6 +469,7 @@ GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1)
|
||||||
[PREC_DOUBLE] = bench_ ## opname ## _double, \
|
[PREC_DOUBLE] = bench_ ## opname ## _double, \
|
||||||
[PREC_FLOAT32] = bench_ ## opname ## _float32, \
|
[PREC_FLOAT32] = bench_ ## opname ## _float32, \
|
||||||
[PREC_FLOAT64] = bench_ ## opname ## _float64, \
|
[PREC_FLOAT64] = bench_ ## opname ## _float64, \
|
||||||
|
[PREC_FLOAT128] = bench_ ## opname ## _float128, \
|
||||||
}
|
}
|
||||||
|
|
||||||
static const bench_func_t bench_funcs[OP_MAX_NR][PREC_MAX_NR] = {
|
static const bench_func_t bench_funcs[OP_MAX_NR][PREC_MAX_NR] = {
|
||||||
|
@ -445,7 +518,7 @@ static void usage_complete(int argc, char *argv[])
|
||||||
fprintf(stderr, " -h = show this help message.\n");
|
fprintf(stderr, " -h = show this help message.\n");
|
||||||
fprintf(stderr, " -o = floating point operation (%s). Default: %s\n",
|
fprintf(stderr, " -o = floating point operation (%s). Default: %s\n",
|
||||||
op_list, op_names[0]);
|
op_list, op_names[0]);
|
||||||
fprintf(stderr, " -p = floating point precision (single, double). "
|
fprintf(stderr, " -p = floating point precision (single, double, quad[soft only]). "
|
||||||
"Default: single\n");
|
"Default: single\n");
|
||||||
fprintf(stderr, " -r = rounding mode (even, zero, down, up, tieaway). "
|
fprintf(stderr, " -r = rounding mode (even, zero, down, up, tieaway). "
|
||||||
"Default: even\n");
|
"Default: even\n");
|
||||||
|
@ -565,6 +638,8 @@ static void parse_args(int argc, char *argv[])
|
||||||
precision = PREC_SINGLE;
|
precision = PREC_SINGLE;
|
||||||
} else if (!strcmp(optarg, "double")) {
|
} else if (!strcmp(optarg, "double")) {
|
||||||
precision = PREC_DOUBLE;
|
precision = PREC_DOUBLE;
|
||||||
|
} else if (!strcmp(optarg, "quad")) {
|
||||||
|
precision = PREC_QUAD;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "Unsupported precision '%s'\n", optarg);
|
fprintf(stderr, "Unsupported precision '%s'\n", optarg);
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
|
@ -608,6 +683,9 @@ static void parse_args(int argc, char *argv[])
|
||||||
case PREC_DOUBLE:
|
case PREC_DOUBLE:
|
||||||
precision = PREC_FLOAT64;
|
precision = PREC_FLOAT64;
|
||||||
break;
|
break;
|
||||||
|
case PREC_QUAD:
|
||||||
|
precision = PREC_FLOAT128;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
g_assert_not_reached();
|
g_assert_not_reached();
|
||||||
}
|
}
|
||||||
|
|
|
@ -717,7 +717,7 @@ static void do_testfloat(int op, int rmode, bool exact)
|
||||||
test_abz_f128(true_abz_f128M, subj_abz_f128M);
|
test_abz_f128(true_abz_f128M, subj_abz_f128M);
|
||||||
break;
|
break;
|
||||||
case F128_MULADD:
|
case F128_MULADD:
|
||||||
not_implemented();
|
test_abcz_f128(slow_f128M_mulAdd, qemu_f128M_mulAdd);
|
||||||
break;
|
break;
|
||||||
case F128_SQRT:
|
case F128_SQRT:
|
||||||
test_az_f128(slow_f128M_sqrt, qemu_f128M_sqrt);
|
test_az_f128(slow_f128M_sqrt, qemu_f128M_sqrt);
|
||||||
|
|
|
@ -574,6 +574,18 @@ WRAP_MULADD(qemu_f32_mulAdd, float32_muladd, float32)
|
||||||
WRAP_MULADD(qemu_f64_mulAdd, float64_muladd, float64)
|
WRAP_MULADD(qemu_f64_mulAdd, float64_muladd, float64)
|
||||||
#undef WRAP_MULADD
|
#undef WRAP_MULADD
|
||||||
|
|
||||||
|
static void qemu_f128M_mulAdd(const float128_t *ap, const float128_t *bp,
|
||||||
|
const float128_t *cp, float128_t *res)
|
||||||
|
{
|
||||||
|
float128 a, b, c, ret;
|
||||||
|
|
||||||
|
a = soft_to_qemu128(*ap);
|
||||||
|
b = soft_to_qemu128(*bp);
|
||||||
|
c = soft_to_qemu128(*cp);
|
||||||
|
ret = float128_muladd(a, b, c, 0, &qsf);
|
||||||
|
*res = qemu_to_soft128(ret);
|
||||||
|
}
|
||||||
|
|
||||||
#define WRAP_CMP16(name, func, retcond) \
|
#define WRAP_CMP16(name, func, retcond) \
|
||||||
static bool name(float16_t a, float16_t b) \
|
static bool name(float16_t a, float16_t b) \
|
||||||
{ \
|
{ \
|
||||||
|
|
Loading…
Reference in New Issue