target/i386: reimplement fyl2x using floatx80 operations

The x87 fyl2x emulation is currently based around conversion to
double.  This is inherently unsuitable for a good emulation of any
floatx80 operation.  Reimplement using the soft-float operations,
building on top of the reimplementation of fyl2xp1 and factoring out
code to be shared between the two instructions.

The included test assumes that the result in round-to-nearest mode
should always be one of the two closest floating-point numbers to the
mathematically exact result (including that it should be exact, in the
exact cases which cover more cases than for fyl2xp1).

Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Message-Id: <alpine.DEB.2.21.2006172321530.20587@digraph.polyomino.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
Joseph Myers 2020-06-17 23:24:29 +00:00 committed by Paolo Bonzini
parent 5eebc49d2d
commit 1f18a1e6ab
2 changed files with 1423 additions and 111 deletions

View File

@ -1223,21 +1223,6 @@ void helper_f2xm1(CPUX86State *env)
merge_exception_flags(env, old_flags);
}
void helper_fyl2x(CPUX86State *env)
{
double fptemp = floatx80_to_double(env, ST0);
if (fptemp > 0.0) {
fptemp = log(fptemp) / log(2.0); /* log2(ST) */
fptemp *= floatx80_to_double(env, ST1);
ST1 = double_to_floatx80(env, fptemp);
fpop(env);
} else {
env->fpus &= ~0x4700;
env->fpus |= 0x400;
}
}
void helper_fptan(CPUX86State *env)
{
double fptemp = floatx80_to_double(env, ST0);
@ -1395,6 +1380,118 @@ void helper_fprem(CPUX86State *env)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
/*
* Compute an approximation of log2(1+arg), where 1+arg is in the
* interval [sqrt(2)/2, sqrt(2)]. It is assumed that when this
* function is called, rounding precision is set to 80 and the
* round-to-nearest mode is in effect. arg must not be exactly zero,
* and must not be so close to zero that underflow might occur.
*/
static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
uint64_t *sig0, uint64_t *sig1)
{
uint64_t arg0_sig = extractFloatx80Frac(arg);
int32_t arg0_exp = extractFloatx80Exp(arg);
bool arg0_sign = extractFloatx80Sign(arg);
bool asign;
int32_t dexp, texp, aexp;
uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
floatx80 t2, accum;
/*
* Compute an approximation of arg/(2+arg), with extra precision,
* as the argument to a polynomial approximation. The extra
* precision is only needed for the first term of the
* approximation, with subsequent terms being significantly
* smaller; the approximation only uses odd exponents, and the
* square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
*/
if (arg0_sign) {
dexp = 0x3fff;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
} else {
dexp = 0x4000;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
dsig0 |= 0x8000000000000000ULL;
}
texp = arg0_exp - dexp + 0x3ffe;
rsig0 = arg0_sig;
rsig1 = 0;
rsig2 = 0;
if (dsig0 <= rsig0) {
shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
++texp;
}
tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
&rsig0, &rsig1, &rsig2);
while ((int64_t) rsig0 < 0) {
--tsig0;
add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
&rsig0, &rsig1, &rsig2);
}
tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
/*
* No need to correct any estimation error in tsig1; even with
* such error, it is accurate enough. Now compute the square of
* that approximation.
*/
mul128To256(tsig0, tsig1, tsig0, tsig1,
&t2sig0, &t2sig1, &t2sig2, &t2sig3);
t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
t2sig0, t2sig1, &env->fp_status);
/* Compute the lower parts of the polynomial expansion. */
accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
/*
* The full polynomial expansion is fyl2x_coeff_0 + accum (where
* accum has much lower magnitude, and so, in particular, carry
* out of the addition is not possible), multiplied by t. (This
* expansion is only accurate to about 70 bits, not 128 bits.)
*/
aexp = extractFloatx80Exp(fyl2x_coeff_0);
asign = extractFloatx80Sign(fyl2x_coeff_0);
shift128RightJamming(extractFloatx80Frac(accum), 0,
aexp - extractFloatx80Exp(accum),
&asig0, &asig1);
bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
bsig1 = 0;
if (asign == extractFloatx80Sign(accum)) {
add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
} else {
sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
}
/* Multiply by t to compute the required result. */
mul128To256(asig0, asig1, tsig0, tsig1,
&asig0, &asig1, &asig2, &asig3);
aexp += texp - 0x3ffe;
*exp = aexp;
*sig0 = asig0;
*sig1 = asig1;
}
void helper_fyl2xp1(CPUX86State *env)
{
uint8_t old_flags = save_exception_flags(env);
@ -1462,109 +1559,18 @@ void helper_fyl2xp1(CPUX86State *env)
ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
sig0, sig1, &env->fp_status);
} else {
bool asign;
uint32_t dexp, texp, aexp;
uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
floatx80 t2, accum;
int32_t aexp;
uint64_t asig0, asig1, asig2;
FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
signed char save_prec = env->fp_status.floatx80_rounding_precision;
env->fp_status.float_rounding_mode = float_round_nearest_even;
env->fp_status.floatx80_rounding_precision = 80;
helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
/*
* Compute an approximation of ST0/(2+ST0), with extra
* precision, as the argument to a polynomial approximation.
* The extra precision is only needed for the first term of
* the approximation, with subsequent terms being
* significantly smaller; the approximation only uses odd
* exponents, and the square of ST0/(2+ST0) is at most
* 17-12*sqrt(2) = 0.029....
* Multiply by the second argument to compute the required
* result.
*/
if (arg0_sign) {
dexp = 0x3fff;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
} else {
dexp = 0x4000;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
dsig0 |= 0x8000000000000000ULL;
}
texp = arg0_exp - dexp + 0x3ffe;
rsig0 = arg0_sig;
rsig1 = 0;
rsig2 = 0;
if (dsig0 <= rsig0) {
shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
++texp;
}
tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
&rsig0, &rsig1, &rsig2);
while ((int64_t) rsig0 < 0) {
--tsig0;
add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
&rsig0, &rsig1, &rsig2);
}
tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
/*
* No need to correct any estimation error in tsig1; even with
* such error, it is accurate enough. Now compute the square
* of that approximation.
*/
mul128To256(tsig0, tsig1, tsig0, tsig1,
&t2sig0, &t2sig1, &t2sig2, &t2sig3);
t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
t2sig0, t2sig1, &env->fp_status);
/* Compute the lower parts of the polynomial expansion. */
accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
/*
* The full polynomial expansion is fyl2x_coeff_0 + accum
* (where accum has much lower magnitude, and so, in
* particular, carry out of the addition is not possible),
* multiplied by t. (This expansion is only accurate to about
* 70 bits, not 128 bits.)
*/
aexp = extractFloatx80Exp(fyl2x_coeff_0);
asign = extractFloatx80Sign(fyl2x_coeff_0);
shift128RightJamming(extractFloatx80Frac(accum), 0,
aexp - extractFloatx80Exp(accum),
&asig0, &asig1);
bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
bsig1 = 0;
if (asign == extractFloatx80Sign(accum)) {
add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
} else {
sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
}
/*
* Multiply by t and by the second argument to compute the
* required result.
*/
mul128To256(asig0, asig1, tsig0, tsig1,
&asig0, &asig1, &asig2, &asig3);
aexp += texp - 0x3ffe;
if (arg1_exp == 0) {
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
}
@ -1581,6 +1587,151 @@ void helper_fyl2xp1(CPUX86State *env)
merge_exception_flags(env, old_flags);
}
void helper_fyl2x(CPUX86State *env)
{
uint8_t old_flags = save_exception_flags(env);
uint64_t arg0_sig = extractFloatx80Frac(ST0);
int32_t arg0_exp = extractFloatx80Exp(ST0);
bool arg0_sign = extractFloatx80Sign(ST0);
uint64_t arg1_sig = extractFloatx80Frac(ST1);
int32_t arg1_exp = extractFloatx80Exp(ST1);
bool arg1_sign = extractFloatx80Sign(ST1);
if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_silence_nan(ST0, &env->fp_status);
} else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_silence_nan(ST1, &env->fp_status);
} else if (floatx80_invalid_encoding(ST0) ||
floatx80_invalid_encoding(ST1)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else if (floatx80_is_any_nan(ST0)) {
ST1 = ST0;
} else if (floatx80_is_any_nan(ST1)) {
/* Pass this NaN through. */
} else if (arg0_sign && !floatx80_is_zero(ST0)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else if (floatx80_is_infinity(ST1)) {
FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
&env->fp_status);
switch (cmp) {
case float_relation_less:
ST1 = floatx80_chs(ST1);
break;
case float_relation_greater:
/* Result is infinity of the same sign as ST1. */
break;
default:
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
break;
}
} else if (floatx80_is_infinity(ST0)) {
if (floatx80_is_zero(ST1)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else if (arg1_sign) {
ST1 = floatx80_chs(ST0);
} else {
ST1 = ST0;
}
} else if (floatx80_is_zero(ST0)) {
if (floatx80_is_zero(ST1)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else {
/* Result is infinity with opposite sign to ST1. */
float_raise(float_flag_divbyzero, &env->fp_status);
ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
0x8000000000000000ULL);
}
} else if (floatx80_is_zero(ST1)) {
if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
ST1 = floatx80_chs(ST1);
}
/* Otherwise, ST1 is already the correct result. */
} else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
if (arg1_sign) {
ST1 = floatx80_chs(floatx80_zero);
} else {
ST1 = floatx80_zero;
}
} else {
int32_t int_exp;
floatx80 arg0_m1;
FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
signed char save_prec = env->fp_status.floatx80_rounding_precision;
env->fp_status.float_rounding_mode = float_round_nearest_even;
env->fp_status.floatx80_rounding_precision = 80;
if (arg0_exp == 0) {
normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
}
if (arg1_exp == 0) {
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
}
int_exp = arg0_exp - 0x3fff;
if (arg0_sig > 0xb504f333f9de6484ULL) {
++int_exp;
}
arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
&env->fp_status),
floatx80_one, &env->fp_status);
if (floatx80_is_zero(arg0_m1)) {
/* Exact power of 2; multiply by ST1. */
env->fp_status.float_rounding_mode = save_mode;
ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
ST1, &env->fp_status);
} else {
bool asign = extractFloatx80Sign(arg0_m1);
int32_t aexp;
uint64_t asig0, asig1, asig2;
helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
if (int_exp != 0) {
bool isign = (int_exp < 0);
int32_t iexp;
uint64_t isig;
int shift;
int_exp = isign ? -int_exp : int_exp;
shift = clz32(int_exp) + 32;
isig = int_exp;
isig <<= shift;
iexp = 0x403e - shift;
shift128RightJamming(asig0, asig1, iexp - aexp,
&asig0, &asig1);
if (asign == isign) {
add128(isig, 0, asig0, asig1, &asig0, &asig1);
} else {
sub128(isig, 0, asig0, asig1, &asig0, &asig1);
}
aexp = iexp;
asign = isign;
}
/*
* Multiply by the second argument to compute the required
* result.
*/
if (arg1_exp == 0) {
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
}
mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
aexp += arg1_exp - 0x3ffe;
/* This result is inexact. */
asig1 |= 1;
env->fp_status.float_rounding_mode = save_mode;
ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
asig0, asig1, &env->fp_status);
}
env->fp_status.floatx80_rounding_precision = save_prec;
}
fpop(env);
merge_exception_flags(env, old_flags);
}
void helper_fsqrt(CPUX86State *env)
{
uint8_t old_flags = save_exception_flags(env);

File diff suppressed because it is too large Load Diff