target/i386: reimplement fyl2x using floatx80 operations
The x87 fyl2x emulation is currently based around conversion to double. This is inherently unsuitable for a good emulation of any floatx80 operation. Reimplement using the soft-float operations, building on top of the reimplementation of fyl2xp1 and factoring out code to be shared between the two instructions. The included test assumes that the result in round-to-nearest mode should always be one of the two closest floating-point numbers to the mathematically exact result (including that it should be exact, in the exact cases which cover more cases than for fyl2xp1). Signed-off-by: Joseph Myers <joseph@codesourcery.com> Message-Id: <alpine.DEB.2.21.2006172321530.20587@digraph.polyomino.org.uk> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
parent
5eebc49d2d
commit
1f18a1e6ab
@ -1223,21 +1223,6 @@ void helper_f2xm1(CPUX86State *env)
|
||||
merge_exception_flags(env, old_flags);
|
||||
}
|
||||
|
||||
void helper_fyl2x(CPUX86State *env)
|
||||
{
|
||||
double fptemp = floatx80_to_double(env, ST0);
|
||||
|
||||
if (fptemp > 0.0) {
|
||||
fptemp = log(fptemp) / log(2.0); /* log2(ST) */
|
||||
fptemp *= floatx80_to_double(env, ST1);
|
||||
ST1 = double_to_floatx80(env, fptemp);
|
||||
fpop(env);
|
||||
} else {
|
||||
env->fpus &= ~0x4700;
|
||||
env->fpus |= 0x400;
|
||||
}
|
||||
}
|
||||
|
||||
void helper_fptan(CPUX86State *env)
|
||||
{
|
||||
double fptemp = floatx80_to_double(env, ST0);
|
||||
@ -1395,6 +1380,118 @@ void helper_fprem(CPUX86State *env)
|
||||
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
|
||||
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
|
||||
|
||||
/*
|
||||
* Compute an approximation of log2(1+arg), where 1+arg is in the
|
||||
* interval [sqrt(2)/2, sqrt(2)]. It is assumed that when this
|
||||
* function is called, rounding precision is set to 80 and the
|
||||
* round-to-nearest mode is in effect. arg must not be exactly zero,
|
||||
* and must not be so close to zero that underflow might occur.
|
||||
*/
|
||||
static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
|
||||
uint64_t *sig0, uint64_t *sig1)
|
||||
{
|
||||
uint64_t arg0_sig = extractFloatx80Frac(arg);
|
||||
int32_t arg0_exp = extractFloatx80Exp(arg);
|
||||
bool arg0_sign = extractFloatx80Sign(arg);
|
||||
bool asign;
|
||||
int32_t dexp, texp, aexp;
|
||||
uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
|
||||
uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
|
||||
uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
|
||||
floatx80 t2, accum;
|
||||
|
||||
/*
|
||||
* Compute an approximation of arg/(2+arg), with extra precision,
|
||||
* as the argument to a polynomial approximation. The extra
|
||||
* precision is only needed for the first term of the
|
||||
* approximation, with subsequent terms being significantly
|
||||
* smaller; the approximation only uses odd exponents, and the
|
||||
* square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
|
||||
*/
|
||||
if (arg0_sign) {
|
||||
dexp = 0x3fff;
|
||||
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
|
||||
sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
|
||||
} else {
|
||||
dexp = 0x4000;
|
||||
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
|
||||
dsig0 |= 0x8000000000000000ULL;
|
||||
}
|
||||
texp = arg0_exp - dexp + 0x3ffe;
|
||||
rsig0 = arg0_sig;
|
||||
rsig1 = 0;
|
||||
rsig2 = 0;
|
||||
if (dsig0 <= rsig0) {
|
||||
shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
|
||||
++texp;
|
||||
}
|
||||
tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
|
||||
mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
|
||||
sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
|
||||
&rsig0, &rsig1, &rsig2);
|
||||
while ((int64_t) rsig0 < 0) {
|
||||
--tsig0;
|
||||
add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
|
||||
&rsig0, &rsig1, &rsig2);
|
||||
}
|
||||
tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
|
||||
/*
|
||||
* No need to correct any estimation error in tsig1; even with
|
||||
* such error, it is accurate enough. Now compute the square of
|
||||
* that approximation.
|
||||
*/
|
||||
mul128To256(tsig0, tsig1, tsig0, tsig1,
|
||||
&t2sig0, &t2sig1, &t2sig2, &t2sig3);
|
||||
t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
|
||||
t2sig0, t2sig1, &env->fp_status);
|
||||
|
||||
/* Compute the lower parts of the polynomial expansion. */
|
||||
accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
|
||||
|
||||
/*
|
||||
* The full polynomial expansion is fyl2x_coeff_0 + accum (where
|
||||
* accum has much lower magnitude, and so, in particular, carry
|
||||
* out of the addition is not possible), multiplied by t. (This
|
||||
* expansion is only accurate to about 70 bits, not 128 bits.)
|
||||
*/
|
||||
aexp = extractFloatx80Exp(fyl2x_coeff_0);
|
||||
asign = extractFloatx80Sign(fyl2x_coeff_0);
|
||||
shift128RightJamming(extractFloatx80Frac(accum), 0,
|
||||
aexp - extractFloatx80Exp(accum),
|
||||
&asig0, &asig1);
|
||||
bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
|
||||
bsig1 = 0;
|
||||
if (asign == extractFloatx80Sign(accum)) {
|
||||
add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
|
||||
} else {
|
||||
sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
|
||||
}
|
||||
/* Multiply by t to compute the required result. */
|
||||
mul128To256(asig0, asig1, tsig0, tsig1,
|
||||
&asig0, &asig1, &asig2, &asig3);
|
||||
aexp += texp - 0x3ffe;
|
||||
*exp = aexp;
|
||||
*sig0 = asig0;
|
||||
*sig1 = asig1;
|
||||
}
|
||||
|
||||
void helper_fyl2xp1(CPUX86State *env)
|
||||
{
|
||||
uint8_t old_flags = save_exception_flags(env);
|
||||
@ -1462,109 +1559,18 @@ void helper_fyl2xp1(CPUX86State *env)
|
||||
ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
|
||||
sig0, sig1, &env->fp_status);
|
||||
} else {
|
||||
bool asign;
|
||||
uint32_t dexp, texp, aexp;
|
||||
uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
|
||||
uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
|
||||
uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
|
||||
floatx80 t2, accum;
|
||||
int32_t aexp;
|
||||
uint64_t asig0, asig1, asig2;
|
||||
FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
|
||||
signed char save_prec = env->fp_status.floatx80_rounding_precision;
|
||||
env->fp_status.float_rounding_mode = float_round_nearest_even;
|
||||
env->fp_status.floatx80_rounding_precision = 80;
|
||||
|
||||
helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
|
||||
/*
|
||||
* Compute an approximation of ST0/(2+ST0), with extra
|
||||
* precision, as the argument to a polynomial approximation.
|
||||
* The extra precision is only needed for the first term of
|
||||
* the approximation, with subsequent terms being
|
||||
* significantly smaller; the approximation only uses odd
|
||||
* exponents, and the square of ST0/(2+ST0) is at most
|
||||
* 17-12*sqrt(2) = 0.029....
|
||||
* Multiply by the second argument to compute the required
|
||||
* result.
|
||||
*/
|
||||
if (arg0_sign) {
|
||||
dexp = 0x3fff;
|
||||
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
|
||||
sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
|
||||
} else {
|
||||
dexp = 0x4000;
|
||||
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
|
||||
dsig0 |= 0x8000000000000000ULL;
|
||||
}
|
||||
texp = arg0_exp - dexp + 0x3ffe;
|
||||
rsig0 = arg0_sig;
|
||||
rsig1 = 0;
|
||||
rsig2 = 0;
|
||||
if (dsig0 <= rsig0) {
|
||||
shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
|
||||
++texp;
|
||||
}
|
||||
tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
|
||||
mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
|
||||
sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
|
||||
&rsig0, &rsig1, &rsig2);
|
||||
while ((int64_t) rsig0 < 0) {
|
||||
--tsig0;
|
||||
add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
|
||||
&rsig0, &rsig1, &rsig2);
|
||||
}
|
||||
tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
|
||||
/*
|
||||
* No need to correct any estimation error in tsig1; even with
|
||||
* such error, it is accurate enough. Now compute the square
|
||||
* of that approximation.
|
||||
*/
|
||||
mul128To256(tsig0, tsig1, tsig0, tsig1,
|
||||
&t2sig0, &t2sig1, &t2sig2, &t2sig3);
|
||||
t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
|
||||
t2sig0, t2sig1, &env->fp_status);
|
||||
|
||||
/* Compute the lower parts of the polynomial expansion. */
|
||||
accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
|
||||
accum = floatx80_mul(accum, t2, &env->fp_status);
|
||||
accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
|
||||
|
||||
/*
|
||||
* The full polynomial expansion is fyl2x_coeff_0 + accum
|
||||
* (where accum has much lower magnitude, and so, in
|
||||
* particular, carry out of the addition is not possible),
|
||||
* multiplied by t. (This expansion is only accurate to about
|
||||
* 70 bits, not 128 bits.)
|
||||
*/
|
||||
aexp = extractFloatx80Exp(fyl2x_coeff_0);
|
||||
asign = extractFloatx80Sign(fyl2x_coeff_0);
|
||||
shift128RightJamming(extractFloatx80Frac(accum), 0,
|
||||
aexp - extractFloatx80Exp(accum),
|
||||
&asig0, &asig1);
|
||||
bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
|
||||
bsig1 = 0;
|
||||
if (asign == extractFloatx80Sign(accum)) {
|
||||
add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
|
||||
} else {
|
||||
sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
|
||||
}
|
||||
/*
|
||||
* Multiply by t and by the second argument to compute the
|
||||
* required result.
|
||||
*/
|
||||
mul128To256(asig0, asig1, tsig0, tsig1,
|
||||
&asig0, &asig1, &asig2, &asig3);
|
||||
aexp += texp - 0x3ffe;
|
||||
if (arg1_exp == 0) {
|
||||
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
|
||||
}
|
||||
@ -1581,6 +1587,151 @@ void helper_fyl2xp1(CPUX86State *env)
|
||||
merge_exception_flags(env, old_flags);
|
||||
}
|
||||
|
||||
void helper_fyl2x(CPUX86State *env)
|
||||
{
|
||||
uint8_t old_flags = save_exception_flags(env);
|
||||
uint64_t arg0_sig = extractFloatx80Frac(ST0);
|
||||
int32_t arg0_exp = extractFloatx80Exp(ST0);
|
||||
bool arg0_sign = extractFloatx80Sign(ST0);
|
||||
uint64_t arg1_sig = extractFloatx80Frac(ST1);
|
||||
int32_t arg1_exp = extractFloatx80Exp(ST1);
|
||||
bool arg1_sign = extractFloatx80Sign(ST1);
|
||||
|
||||
if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_silence_nan(ST0, &env->fp_status);
|
||||
} else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_silence_nan(ST1, &env->fp_status);
|
||||
} else if (floatx80_invalid_encoding(ST0) ||
|
||||
floatx80_invalid_encoding(ST1)) {
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_default_nan(&env->fp_status);
|
||||
} else if (floatx80_is_any_nan(ST0)) {
|
||||
ST1 = ST0;
|
||||
} else if (floatx80_is_any_nan(ST1)) {
|
||||
/* Pass this NaN through. */
|
||||
} else if (arg0_sign && !floatx80_is_zero(ST0)) {
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_default_nan(&env->fp_status);
|
||||
} else if (floatx80_is_infinity(ST1)) {
|
||||
FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
|
||||
&env->fp_status);
|
||||
switch (cmp) {
|
||||
case float_relation_less:
|
||||
ST1 = floatx80_chs(ST1);
|
||||
break;
|
||||
case float_relation_greater:
|
||||
/* Result is infinity of the same sign as ST1. */
|
||||
break;
|
||||
default:
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_default_nan(&env->fp_status);
|
||||
break;
|
||||
}
|
||||
} else if (floatx80_is_infinity(ST0)) {
|
||||
if (floatx80_is_zero(ST1)) {
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_default_nan(&env->fp_status);
|
||||
} else if (arg1_sign) {
|
||||
ST1 = floatx80_chs(ST0);
|
||||
} else {
|
||||
ST1 = ST0;
|
||||
}
|
||||
} else if (floatx80_is_zero(ST0)) {
|
||||
if (floatx80_is_zero(ST1)) {
|
||||
float_raise(float_flag_invalid, &env->fp_status);
|
||||
ST1 = floatx80_default_nan(&env->fp_status);
|
||||
} else {
|
||||
/* Result is infinity with opposite sign to ST1. */
|
||||
float_raise(float_flag_divbyzero, &env->fp_status);
|
||||
ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
|
||||
0x8000000000000000ULL);
|
||||
}
|
||||
} else if (floatx80_is_zero(ST1)) {
|
||||
if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
|
||||
ST1 = floatx80_chs(ST1);
|
||||
}
|
||||
/* Otherwise, ST1 is already the correct result. */
|
||||
} else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
|
||||
if (arg1_sign) {
|
||||
ST1 = floatx80_chs(floatx80_zero);
|
||||
} else {
|
||||
ST1 = floatx80_zero;
|
||||
}
|
||||
} else {
|
||||
int32_t int_exp;
|
||||
floatx80 arg0_m1;
|
||||
FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
|
||||
signed char save_prec = env->fp_status.floatx80_rounding_precision;
|
||||
env->fp_status.float_rounding_mode = float_round_nearest_even;
|
||||
env->fp_status.floatx80_rounding_precision = 80;
|
||||
|
||||
if (arg0_exp == 0) {
|
||||
normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
|
||||
}
|
||||
if (arg1_exp == 0) {
|
||||
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
|
||||
}
|
||||
int_exp = arg0_exp - 0x3fff;
|
||||
if (arg0_sig > 0xb504f333f9de6484ULL) {
|
||||
++int_exp;
|
||||
}
|
||||
arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
|
||||
&env->fp_status),
|
||||
floatx80_one, &env->fp_status);
|
||||
if (floatx80_is_zero(arg0_m1)) {
|
||||
/* Exact power of 2; multiply by ST1. */
|
||||
env->fp_status.float_rounding_mode = save_mode;
|
||||
ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
|
||||
ST1, &env->fp_status);
|
||||
} else {
|
||||
bool asign = extractFloatx80Sign(arg0_m1);
|
||||
int32_t aexp;
|
||||
uint64_t asig0, asig1, asig2;
|
||||
helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
|
||||
if (int_exp != 0) {
|
||||
bool isign = (int_exp < 0);
|
||||
int32_t iexp;
|
||||
uint64_t isig;
|
||||
int shift;
|
||||
int_exp = isign ? -int_exp : int_exp;
|
||||
shift = clz32(int_exp) + 32;
|
||||
isig = int_exp;
|
||||
isig <<= shift;
|
||||
iexp = 0x403e - shift;
|
||||
shift128RightJamming(asig0, asig1, iexp - aexp,
|
||||
&asig0, &asig1);
|
||||
if (asign == isign) {
|
||||
add128(isig, 0, asig0, asig1, &asig0, &asig1);
|
||||
} else {
|
||||
sub128(isig, 0, asig0, asig1, &asig0, &asig1);
|
||||
}
|
||||
aexp = iexp;
|
||||
asign = isign;
|
||||
}
|
||||
/*
|
||||
* Multiply by the second argument to compute the required
|
||||
* result.
|
||||
*/
|
||||
if (arg1_exp == 0) {
|
||||
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
|
||||
}
|
||||
mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
|
||||
aexp += arg1_exp - 0x3ffe;
|
||||
/* This result is inexact. */
|
||||
asig1 |= 1;
|
||||
env->fp_status.float_rounding_mode = save_mode;
|
||||
ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
|
||||
asig0, asig1, &env->fp_status);
|
||||
}
|
||||
|
||||
env->fp_status.floatx80_rounding_precision = save_prec;
|
||||
}
|
||||
fpop(env);
|
||||
merge_exception_flags(env, old_flags);
|
||||
}
|
||||
|
||||
void helper_fsqrt(CPUX86State *env)
|
||||
{
|
||||
uint8_t old_flags = save_exception_flags(env);
|
||||
|
1161
tests/tcg/i386/test-i386-fyl2x.c
Normal file
1161
tests/tcg/i386/test-i386-fyl2x.c
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user