fpu/softfloat: re-factor add/sub

We can now add float16_add/sub and use the common decompose and
canonicalize functions to have a single implementation for
float16/32/64 add and sub functions.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
This commit is contained in:
Alex Bennée 2017-11-27 14:15:17 +00:00
parent a90119b5a2
commit 6fff216769
2 changed files with 469 additions and 427 deletions

View File

@ -83,6 +83,7 @@ this code that are retained.
* target-dependent and needs the TARGET_* macros.
*/
#include "qemu/osdep.h"
#include "qemu/bitops.h"
#include "fpu/softfloat.h"
/* We only need stdlib for abort() */
@ -270,6 +271,470 @@ static const FloatFmt float64_params = {
FLOAT_PARAMS(11, 52)
};
/* Unpack a float to parts, but do not canonicalize. */
static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
{
const int sign_pos = fmt.frac_size + fmt.exp_size;
return (FloatParts) {
.cls = float_class_unclassified,
.sign = extract64(raw, sign_pos, 1),
.exp = extract64(raw, fmt.frac_size, fmt.exp_size),
.frac = extract64(raw, 0, fmt.frac_size),
};
}
static inline FloatParts float16_unpack_raw(float16 f)
{
return unpack_raw(float16_params, f);
}
static inline FloatParts float32_unpack_raw(float32 f)
{
return unpack_raw(float32_params, f);
}
static inline FloatParts float64_unpack_raw(float64 f)
{
return unpack_raw(float64_params, f);
}
/* Pack a float from parts, but do not canonicalize. */
static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
{
const int sign_pos = fmt.frac_size + fmt.exp_size;
uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
return deposit64(ret, sign_pos, 1, p.sign);
}
static inline float16 float16_pack_raw(FloatParts p)
{
return make_float16(pack_raw(float16_params, p));
}
static inline float32 float32_pack_raw(FloatParts p)
{
return make_float32(pack_raw(float32_params, p));
}
static inline float64 float64_pack_raw(FloatParts p)
{
return make_float64(pack_raw(float64_params, p));
}
/* Canonicalize EXP and FRAC, setting CLS. */
static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
float_status *status)
{
if (part.exp == parm->exp_max) {
if (part.frac == 0) {
part.cls = float_class_inf;
} else {
#ifdef NO_SIGNALING_NANS
part.cls = float_class_qnan;
#else
int64_t msb = part.frac << (parm->frac_shift + 2);
if ((msb < 0) == status->snan_bit_is_one) {
part.cls = float_class_snan;
} else {
part.cls = float_class_qnan;
}
#endif
}
} else if (part.exp == 0) {
if (likely(part.frac == 0)) {
part.cls = float_class_zero;
} else if (status->flush_inputs_to_zero) {
float_raise(float_flag_input_denormal, status);
part.cls = float_class_zero;
part.frac = 0;
} else {
int shift = clz64(part.frac) - 1;
part.cls = float_class_normal;
part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
part.frac <<= shift;
}
} else {
part.cls = float_class_normal;
part.exp -= parm->exp_bias;
part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
}
return part;
}
/* Round and uncanonicalize a floating-point number by parts. There
* are FRAC_SHIFT bits that may require rounding at the bottom of the
* fraction; these bits will be removed. The exponent will be biased
* by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
*/
static FloatParts round_canonical(FloatParts p, float_status *s,
const FloatFmt *parm)
{
const uint64_t frac_lsbm1 = parm->frac_lsbm1;
const uint64_t round_mask = parm->round_mask;
const uint64_t roundeven_mask = parm->roundeven_mask;
const int exp_max = parm->exp_max;
const int frac_shift = parm->frac_shift;
uint64_t frac, inc;
int exp, flags = 0;
bool overflow_norm;
frac = p.frac;
exp = p.exp;
switch (p.cls) {
case float_class_normal:
switch (s->float_rounding_mode) {
case float_round_nearest_even:
overflow_norm = false;
inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
break;
case float_round_ties_away:
overflow_norm = false;
inc = frac_lsbm1;
break;
case float_round_to_zero:
overflow_norm = true;
inc = 0;
break;
case float_round_up:
inc = p.sign ? 0 : round_mask;
overflow_norm = p.sign;
break;
case float_round_down:
inc = p.sign ? round_mask : 0;
overflow_norm = !p.sign;
break;
default:
g_assert_not_reached();
}
exp += parm->exp_bias;
if (likely(exp > 0)) {
if (frac & round_mask) {
flags |= float_flag_inexact;
frac += inc;
if (frac & DECOMPOSED_OVERFLOW_BIT) {
frac >>= 1;
exp++;
}
}
frac >>= frac_shift;
if (unlikely(exp >= exp_max)) {
flags |= float_flag_overflow | float_flag_inexact;
if (overflow_norm) {
exp = exp_max - 1;
frac = -1;
} else {
p.cls = float_class_inf;
goto do_inf;
}
}
} else if (s->flush_to_zero) {
flags |= float_flag_output_denormal;
p.cls = float_class_zero;
goto do_zero;
} else {
bool is_tiny = (s->float_detect_tininess
== float_tininess_before_rounding)
|| (exp < 0)
|| !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
shift64RightJamming(frac, 1 - exp, &frac);
if (frac & round_mask) {
/* Need to recompute round-to-even. */
if (s->float_rounding_mode == float_round_nearest_even) {
inc = ((frac & roundeven_mask) != frac_lsbm1
? frac_lsbm1 : 0);
}
flags |= float_flag_inexact;
frac += inc;
}
exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
frac >>= frac_shift;
if (is_tiny && (flags & float_flag_inexact)) {
flags |= float_flag_underflow;
}
if (exp == 0 && frac == 0) {
p.cls = float_class_zero;
}
}
break;
case float_class_zero:
do_zero:
exp = 0;
frac = 0;
break;
case float_class_inf:
do_inf:
exp = exp_max;
frac = 0;
break;
case float_class_qnan:
case float_class_snan:
exp = exp_max;
break;
default:
g_assert_not_reached();
}
float_raise(flags, s);
p.exp = exp;
p.frac = frac;
return p;
}
static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
return canonicalize(float16_unpack_raw(f), &float16_params, s);
}
static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
switch (p.cls) {
case float_class_dnan:
return float16_default_nan(s);
case float_class_msnan:
return float16_maybe_silence_nan(float16_pack_raw(p), s);
default:
p = round_canonical(p, s, &float16_params);
return float16_pack_raw(p);
}
}
static FloatParts float32_unpack_canonical(float32 f, float_status *s)
{
return canonicalize(float32_unpack_raw(f), &float32_params, s);
}
static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
switch (p.cls) {
case float_class_dnan:
return float32_default_nan(s);
case float_class_msnan:
return float32_maybe_silence_nan(float32_pack_raw(p), s);
default:
p = round_canonical(p, s, &float32_params);
return float32_pack_raw(p);
}
}
static FloatParts float64_unpack_canonical(float64 f, float_status *s)
{
return canonicalize(float64_unpack_raw(f), &float64_params, s);
}
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
switch (p.cls) {
case float_class_dnan:
return float64_default_nan(s);
case float_class_msnan:
return float64_maybe_silence_nan(float64_pack_raw(p), s);
default:
p = round_canonical(p, s, &float64_params);
return float64_pack_raw(p);
}
}
/* Simple helpers for checking if what NaN we have */
static bool is_nan(FloatClass c)
{
return unlikely(c >= float_class_qnan);
}
static bool is_snan(FloatClass c)
{
return c == float_class_snan;
}
static bool is_qnan(FloatClass c)
{
return c == float_class_qnan;
}
static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
{
if (is_snan(a.cls) || is_snan(b.cls)) {
s->float_exception_flags |= float_flag_invalid;
}
if (s->default_nan_mode) {
a.cls = float_class_dnan;
} else {
if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
is_qnan(b.cls), is_snan(b.cls),
a.frac > b.frac ||
(a.frac == b.frac && a.sign < b.sign))) {
a = b;
}
a.cls = float_class_msnan;
}
return a;
}
/*
* Returns the result of adding or subtracting the values of the
* floating-point values `a' and `b'. The operation is performed
* according to the IEC/IEEE Standard for Binary Floating-Point
* Arithmetic.
*/
static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
float_status *s)
{
bool a_sign = a.sign;
bool b_sign = b.sign ^ subtract;
if (a_sign != b_sign) {
/* Subtraction */
if (a.cls == float_class_normal && b.cls == float_class_normal) {
if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
a.frac = a.frac - b.frac;
} else {
shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
a.frac = b.frac - a.frac;
a.exp = b.exp;
a_sign ^= 1;
}
if (a.frac == 0) {
a.cls = float_class_zero;
a.sign = s->float_rounding_mode == float_round_down;
} else {
int shift = clz64(a.frac) - 1;
a.frac = a.frac << shift;
a.exp = a.exp - shift;
a.sign = a_sign;
}
return a;
}
if (is_nan(a.cls) || is_nan(b.cls)) {
return pick_nan(a, b, s);
}
if (a.cls == float_class_inf) {
if (b.cls == float_class_inf) {
float_raise(float_flag_invalid, s);
a.cls = float_class_dnan;
}
return a;
}
if (a.cls == float_class_zero && b.cls == float_class_zero) {
a.sign = s->float_rounding_mode == float_round_down;
return a;
}
if (a.cls == float_class_zero || b.cls == float_class_inf) {
b.sign = a_sign ^ 1;
return b;
}
if (b.cls == float_class_zero) {
return a;
}
} else {
/* Addition */
if (a.cls == float_class_normal && b.cls == float_class_normal) {
if (a.exp > b.exp) {
shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
} else if (a.exp < b.exp) {
shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
a.exp = b.exp;
}
a.frac += b.frac;
if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
a.frac >>= 1;
a.exp += 1;
}
return a;
}
if (is_nan(a.cls) || is_nan(b.cls)) {
return pick_nan(a, b, s);
}
if (a.cls == float_class_inf || b.cls == float_class_zero) {
return a;
}
if (b.cls == float_class_inf || a.cls == float_class_zero) {
b.sign = b_sign;
return b;
}
}
g_assert_not_reached();
}
/*
* Returns the result of adding or subtracting the floating-point
* values `a' and `b'. The operation is performed according to the
* IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*/
float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
float_status *status)
{
FloatParts pa = float16_unpack_canonical(a, status);
FloatParts pb = float16_unpack_canonical(b, status);
FloatParts pr = addsub_floats(pa, pb, false, status);
return float16_round_pack_canonical(pr, status);
}
float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
float_status *status)
{
FloatParts pa = float32_unpack_canonical(a, status);
FloatParts pb = float32_unpack_canonical(b, status);
FloatParts pr = addsub_floats(pa, pb, false, status);
return float32_round_pack_canonical(pr, status);
}
float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
float_status *status)
{
FloatParts pa = float64_unpack_canonical(a, status);
FloatParts pb = float64_unpack_canonical(b, status);
FloatParts pr = addsub_floats(pa, pb, false, status);
return float64_round_pack_canonical(pr, status);
}
float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
float_status *status)
{
FloatParts pa = float16_unpack_canonical(a, status);
FloatParts pb = float16_unpack_canonical(b, status);
FloatParts pr = addsub_floats(pa, pb, true, status);
return float16_round_pack_canonical(pr, status);
}
float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
float_status *status)
{
FloatParts pa = float32_unpack_canonical(a, status);
FloatParts pb = float32_unpack_canonical(b, status);
FloatParts pr = addsub_floats(pa, pb, true, status);
return float32_round_pack_canonical(pr, status);
}
float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
float_status *status)
{
FloatParts pa = float64_unpack_canonical(a, status);
FloatParts pb = float64_unpack_canonical(b, status);
FloatParts pr = addsub_floats(pa, pb, true, status);
return float64_round_pack_canonical(pr, status);
}
/*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
| and 7, and returns the properly rounded 32-bit integer corresponding to the
@ -2081,220 +2546,6 @@ float32 float32_round_to_int(float32 a, float_status *status)
}
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the single-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
float_status *status)
{
int aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
int expDiff;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
bSig = extractFloat32Frac( b );
bExp = extractFloat32Exp( b );
expDiff = aExp - bExp;
aSig <<= 6;
bSig <<= 6;
if ( 0 < expDiff ) {
if ( aExp == 0xFF ) {
if (aSig) {
return propagateFloat32NaN(a, b, status);
}
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig |= 0x20000000;
}
shift32RightJamming( bSig, expDiff, &bSig );
zExp = aExp;
}
else if ( expDiff < 0 ) {
if ( bExp == 0xFF ) {
if (bSig) {
return propagateFloat32NaN(a, b, status);
}
return packFloat32( zSign, 0xFF, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig |= 0x20000000;
}
shift32RightJamming( aSig, - expDiff, &aSig );
zExp = bExp;
}
else {
if ( aExp == 0xFF ) {
if (aSig | bSig) {
return propagateFloat32NaN(a, b, status);
}
return a;
}
if ( aExp == 0 ) {
if (status->flush_to_zero) {
if (aSig | bSig) {
float_raise(float_flag_output_denormal, status);
}
return packFloat32(zSign, 0, 0);
}
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
}
zSig = 0x40000000 + aSig + bSig;
zExp = aExp;
goto roundAndPack;
}
aSig |= 0x20000000;
zSig = ( aSig + bSig )<<1;
--zExp;
if ( (int32_t) zSig < 0 ) {
zSig = aSig + bSig;
++zExp;
}
roundAndPack:
return roundAndPackFloat32(zSign, zExp, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the single-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
float_status *status)
{
int aExp, bExp, zExp;
uint32_t aSig, bSig, zSig;
int expDiff;
aSig = extractFloat32Frac( a );
aExp = extractFloat32Exp( a );
bSig = extractFloat32Frac( b );
bExp = extractFloat32Exp( b );
expDiff = aExp - bExp;
aSig <<= 7;
bSig <<= 7;
if ( 0 < expDiff ) goto aExpBigger;
if ( expDiff < 0 ) goto bExpBigger;
if ( aExp == 0xFF ) {
if (aSig | bSig) {
return propagateFloat32NaN(a, b, status);
}
float_raise(float_flag_invalid, status);
return float32_default_nan(status);
}
if ( aExp == 0 ) {
aExp = 1;
bExp = 1;
}
if ( bSig < aSig ) goto aBigger;
if ( aSig < bSig ) goto bBigger;
return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
bExpBigger:
if ( bExp == 0xFF ) {
if (bSig) {
return propagateFloat32NaN(a, b, status);
}
return packFloat32( zSign ^ 1, 0xFF, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig |= 0x40000000;
}
shift32RightJamming( aSig, - expDiff, &aSig );
bSig |= 0x40000000;
bBigger:
zSig = bSig - aSig;
zExp = bExp;
zSign ^= 1;
goto normalizeRoundAndPack;
aExpBigger:
if ( aExp == 0xFF ) {
if (aSig) {
return propagateFloat32NaN(a, b, status);
}
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig |= 0x40000000;
}
shift32RightJamming( bSig, expDiff, &bSig );
aSig |= 0x40000000;
aBigger:
zSig = aSig - bSig;
zExp = aExp;
normalizeRoundAndPack:
--zExp;
return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of adding the single-precision floating-point values `a'
| and `b'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_add(float32 a, float32 b, float_status *status)
{
flag aSign, bSign;
a = float32_squash_input_denormal(a, status);
b = float32_squash_input_denormal(b, status);
aSign = extractFloat32Sign( a );
bSign = extractFloat32Sign( b );
if ( aSign == bSign ) {
return addFloat32Sigs(a, b, aSign, status);
}
else {
return subFloat32Sigs(a, b, aSign, status);
}
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the single-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_sub(float32 a, float32 b, float_status *status)
{
flag aSign, bSign;
a = float32_squash_input_denormal(a, status);
b = float32_squash_input_denormal(b, status);
aSign = extractFloat32Sign( a );
bSign = extractFloat32Sign( b );
if ( aSign == bSign ) {
return subFloat32Sigs(a, b, aSign, status);
}
else {
return addFloat32Sigs(a, b, aSign, status);
}
}
/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
@ -3891,219 +4142,6 @@ float64 float64_trunc_to_int(float64 a, float_status *status)
return res;
}
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the double-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
float_status *status)
{
int aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
int expDiff;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
bSig = extractFloat64Frac( b );
bExp = extractFloat64Exp( b );
expDiff = aExp - bExp;
aSig <<= 9;
bSig <<= 9;
if ( 0 < expDiff ) {
if ( aExp == 0x7FF ) {
if (aSig) {
return propagateFloat64NaN(a, b, status);
}
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig |= LIT64( 0x2000000000000000 );
}
shift64RightJamming( bSig, expDiff, &bSig );
zExp = aExp;
}
else if ( expDiff < 0 ) {
if ( bExp == 0x7FF ) {
if (bSig) {
return propagateFloat64NaN(a, b, status);
}
return packFloat64( zSign, 0x7FF, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig |= LIT64( 0x2000000000000000 );
}
shift64RightJamming( aSig, - expDiff, &aSig );
zExp = bExp;
}
else {
if ( aExp == 0x7FF ) {
if (aSig | bSig) {
return propagateFloat64NaN(a, b, status);
}
return a;
}
if ( aExp == 0 ) {
if (status->flush_to_zero) {
if (aSig | bSig) {
float_raise(float_flag_output_denormal, status);
}
return packFloat64(zSign, 0, 0);
}
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
}
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
zExp = aExp;
goto roundAndPack;
}
aSig |= LIT64( 0x2000000000000000 );
zSig = ( aSig + bSig )<<1;
--zExp;
if ( (int64_t) zSig < 0 ) {
zSig = aSig + bSig;
++zExp;
}
roundAndPack:
return roundAndPackFloat64(zSign, zExp, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the double-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
float_status *status)
{
int aExp, bExp, zExp;
uint64_t aSig, bSig, zSig;
int expDiff;
aSig = extractFloat64Frac( a );
aExp = extractFloat64Exp( a );
bSig = extractFloat64Frac( b );
bExp = extractFloat64Exp( b );
expDiff = aExp - bExp;
aSig <<= 10;
bSig <<= 10;
if ( 0 < expDiff ) goto aExpBigger;
if ( expDiff < 0 ) goto bExpBigger;
if ( aExp == 0x7FF ) {
if (aSig | bSig) {
return propagateFloat64NaN(a, b, status);
}
float_raise(float_flag_invalid, status);
return float64_default_nan(status);
}
if ( aExp == 0 ) {
aExp = 1;
bExp = 1;
}
if ( bSig < aSig ) goto aBigger;
if ( aSig < bSig ) goto bBigger;
return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
bExpBigger:
if ( bExp == 0x7FF ) {
if (bSig) {
return propagateFloat64NaN(a, b, status);
}
return packFloat64( zSign ^ 1, 0x7FF, 0 );
}
if ( aExp == 0 ) {
++expDiff;
}
else {
aSig |= LIT64( 0x4000000000000000 );
}
shift64RightJamming( aSig, - expDiff, &aSig );
bSig |= LIT64( 0x4000000000000000 );
bBigger:
zSig = bSig - aSig;
zExp = bExp;
zSign ^= 1;
goto normalizeRoundAndPack;
aExpBigger:
if ( aExp == 0x7FF ) {
if (aSig) {
return propagateFloat64NaN(a, b, status);
}
return a;
}
if ( bExp == 0 ) {
--expDiff;
}
else {
bSig |= LIT64( 0x4000000000000000 );
}
shift64RightJamming( bSig, expDiff, &bSig );
aSig |= LIT64( 0x4000000000000000 );
aBigger:
zSig = aSig - bSig;
zExp = aExp;
normalizeRoundAndPack:
--zExp;
return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
}
/*----------------------------------------------------------------------------
| Returns the result of adding the double-precision floating-point values `a'
| and `b'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_add(float64 a, float64 b, float_status *status)
{
flag aSign, bSign;
a = float64_squash_input_denormal(a, status);
b = float64_squash_input_denormal(b, status);
aSign = extractFloat64Sign( a );
bSign = extractFloat64Sign( b );
if ( aSign == bSign ) {
return addFloat64Sigs(a, b, aSign, status);
}
else {
return subFloat64Sigs(a, b, aSign, status);
}
}
/*----------------------------------------------------------------------------
| Returns the result of subtracting the double-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_sub(float64 a, float64 b, float_status *status)
{
flag aSign, bSign;
a = float64_squash_input_denormal(a, status);
b = float64_squash_input_denormal(b, status);
aSign = extractFloat64Sign( a );
bSign = extractFloat64Sign( b );
if ( aSign == bSign ) {
return subFloat64Sigs(a, b, aSign, status);
}
else {
return addFloat64Sigs(a, b, aSign, status);
}
}
/*----------------------------------------------------------------------------
| Returns the result of multiplying the double-precision floating-point values

View File

@ -236,6 +236,10 @@ float64 float16_to_float64(float16 a, flag ieee, float_status *status);
/*----------------------------------------------------------------------------
| Software half-precision operations.
*----------------------------------------------------------------------------*/
float16 float16_add(float16, float16, float_status *status);
float16 float16_sub(float16, float16, float_status *status);
int float16_is_quiet_nan(float16, float_status *status);
int float16_is_signaling_nan(float16, float_status *status);
float16 float16_maybe_silence_nan(float16, float_status *status);