qemu-e2k/target-i386/misc_helper.c
Paolo Bonzini 81f3053b77 target-i386: yield to another VCPU on PAUSE
After commit b1bbfe7 (aio / timers: On timer modification, qemu_notify
or aio_notify, 2013-08-21) FreeBSD guests report a huge slowdown.

The problem shows up as soon as FreeBSD turns out its periodic (~1 ms)
tick, but the timers are only the trigger for a pre-existing problem.

Before the offending patch, setting a timer did a timer_settime system call.

After, setting the timer exits the event loop (which uses poll) and
reenters it with a new deadline.  This does not cause any slowdown; the
difference is between one system call (timer_settime and a signal
delivery (SIGALRM) before the patch, and two system calls afterwards
(write to a pipe or eventfd + calling poll again when re-entering the
event loop).

Unfortunately, the exit/enter causes the main loop to grab the iothread
lock, which in turns kicks the VCPU thread out of execution.  This
causes TCG to execute the next VCPU in its round-robin scheduling of
VCPUS.  When the second VCPU is mostly unused, FreeBSD runs a "pause"
instruction in its idle loop which only burns cycles without any
progress.  As soon as the timer tick expires, the first VCPU runs
the interrupt handler but very soon it sets it again---and QEMU
then goes back doing nothing in the second VCPU.

The fix is to make the pause instruction do "cpu_loop_exit".

Cc: Richard Henderson <rth@twiddle.net>
Reported-by: Luigi Rizzo <rizzo@iet.unipi.it>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Message-id: 1384948442-24217-1-git-send-email-pbonzini@redhat.com
Signed-off-by: Anthony Liguori <aliguori@amazon.com>
2013-11-21 07:55:45 -08:00

644 lines
16 KiB
C

/*
* x86 misc helpers
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "cpu.h"
#include "exec/ioport.h"
#include "helper.h"
#if !defined(CONFIG_USER_ONLY)
#include "exec/softmmu_exec.h"
#endif /* !defined(CONFIG_USER_ONLY) */
/* check if Port I/O is allowed in TSS */
static inline void check_io(CPUX86State *env, int addr, int size)
{
int io_offset, val, mask;
/* TSS must be a valid 32 bit one */
if (!(env->tr.flags & DESC_P_MASK) ||
((env->tr.flags >> DESC_TYPE_SHIFT) & 0xf) != 9 ||
env->tr.limit < 103) {
goto fail;
}
io_offset = cpu_lduw_kernel(env, env->tr.base + 0x66);
io_offset += (addr >> 3);
/* Note: the check needs two bytes */
if ((io_offset + 1) > env->tr.limit) {
goto fail;
}
val = cpu_lduw_kernel(env, env->tr.base + io_offset);
val >>= (addr & 7);
mask = (1 << size) - 1;
/* all bits must be zero to allow the I/O */
if ((val & mask) != 0) {
fail:
raise_exception_err(env, EXCP0D_GPF, 0);
}
}
void helper_check_iob(CPUX86State *env, uint32_t t0)
{
check_io(env, t0, 1);
}
void helper_check_iow(CPUX86State *env, uint32_t t0)
{
check_io(env, t0, 2);
}
void helper_check_iol(CPUX86State *env, uint32_t t0)
{
check_io(env, t0, 4);
}
void helper_outb(uint32_t port, uint32_t data)
{
cpu_outb(port, data & 0xff);
}
target_ulong helper_inb(uint32_t port)
{
return cpu_inb(port);
}
void helper_outw(uint32_t port, uint32_t data)
{
cpu_outw(port, data & 0xffff);
}
target_ulong helper_inw(uint32_t port)
{
return cpu_inw(port);
}
void helper_outl(uint32_t port, uint32_t data)
{
cpu_outl(port, data);
}
target_ulong helper_inl(uint32_t port)
{
return cpu_inl(port);
}
void helper_into(CPUX86State *env, int next_eip_addend)
{
int eflags;
eflags = cpu_cc_compute_all(env, CC_OP);
if (eflags & CC_O) {
raise_interrupt(env, EXCP04_INTO, 1, 0, next_eip_addend);
}
}
void helper_single_step(CPUX86State *env)
{
#ifndef CONFIG_USER_ONLY
check_hw_breakpoints(env, true);
env->dr[6] |= DR6_BS;
#endif
raise_exception(env, EXCP01_DB);
}
void helper_cpuid(CPUX86State *env)
{
uint32_t eax, ebx, ecx, edx;
cpu_svm_check_intercept_param(env, SVM_EXIT_CPUID, 0);
cpu_x86_cpuid(env, (uint32_t)env->regs[R_EAX], (uint32_t)env->regs[R_ECX],
&eax, &ebx, &ecx, &edx);
env->regs[R_EAX] = eax;
env->regs[R_EBX] = ebx;
env->regs[R_ECX] = ecx;
env->regs[R_EDX] = edx;
}
#if defined(CONFIG_USER_ONLY)
target_ulong helper_read_crN(CPUX86State *env, int reg)
{
return 0;
}
void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
{
}
void helper_movl_drN_T0(CPUX86State *env, int reg, target_ulong t0)
{
}
#else
target_ulong helper_read_crN(CPUX86State *env, int reg)
{
target_ulong val;
cpu_svm_check_intercept_param(env, SVM_EXIT_READ_CR0 + reg, 0);
switch (reg) {
default:
val = env->cr[reg];
break;
case 8:
if (!(env->hflags2 & HF2_VINTR_MASK)) {
val = cpu_get_apic_tpr(env->apic_state);
} else {
val = env->v_tpr;
}
break;
}
return val;
}
void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
{
cpu_svm_check_intercept_param(env, SVM_EXIT_WRITE_CR0 + reg, 0);
switch (reg) {
case 0:
cpu_x86_update_cr0(env, t0);
break;
case 3:
cpu_x86_update_cr3(env, t0);
break;
case 4:
cpu_x86_update_cr4(env, t0);
break;
case 8:
if (!(env->hflags2 & HF2_VINTR_MASK)) {
cpu_set_apic_tpr(env->apic_state, t0);
}
env->v_tpr = t0 & 0x0f;
break;
default:
env->cr[reg] = t0;
break;
}
}
void helper_movl_drN_T0(CPUX86State *env, int reg, target_ulong t0)
{
int i;
if (reg < 4) {
hw_breakpoint_remove(env, reg);
env->dr[reg] = t0;
hw_breakpoint_insert(env, reg);
} else if (reg == 7) {
for (i = 0; i < DR7_MAX_BP; i++) {
hw_breakpoint_remove(env, i);
}
env->dr[7] = t0;
for (i = 0; i < DR7_MAX_BP; i++) {
hw_breakpoint_insert(env, i);
}
} else {
env->dr[reg] = t0;
}
}
#endif
void helper_lmsw(CPUX86State *env, target_ulong t0)
{
/* only 4 lower bits of CR0 are modified. PE cannot be set to zero
if already set to one. */
t0 = (env->cr[0] & ~0xe) | (t0 & 0xf);
helper_write_crN(env, 0, t0);
}
void helper_invlpg(CPUX86State *env, target_ulong addr)
{
cpu_svm_check_intercept_param(env, SVM_EXIT_INVLPG, 0);
tlb_flush_page(env, addr);
}
void helper_rdtsc(CPUX86State *env)
{
uint64_t val;
if ((env->cr[4] & CR4_TSD_MASK) && ((env->hflags & HF_CPL_MASK) != 0)) {
raise_exception(env, EXCP0D_GPF);
}
cpu_svm_check_intercept_param(env, SVM_EXIT_RDTSC, 0);
val = cpu_get_tsc(env) + env->tsc_offset;
env->regs[R_EAX] = (uint32_t)(val);
env->regs[R_EDX] = (uint32_t)(val >> 32);
}
void helper_rdtscp(CPUX86State *env)
{
helper_rdtsc(env);
env->regs[R_ECX] = (uint32_t)(env->tsc_aux);
}
void helper_rdpmc(CPUX86State *env)
{
if ((env->cr[4] & CR4_PCE_MASK) && ((env->hflags & HF_CPL_MASK) != 0)) {
raise_exception(env, EXCP0D_GPF);
}
cpu_svm_check_intercept_param(env, SVM_EXIT_RDPMC, 0);
/* currently unimplemented */
qemu_log_mask(LOG_UNIMP, "x86: unimplemented rdpmc\n");
raise_exception_err(env, EXCP06_ILLOP, 0);
}
#if defined(CONFIG_USER_ONLY)
void helper_wrmsr(CPUX86State *env)
{
}
void helper_rdmsr(CPUX86State *env)
{
}
#else
void helper_wrmsr(CPUX86State *env)
{
uint64_t val;
cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1);
val = ((uint32_t)env->regs[R_EAX]) |
((uint64_t)((uint32_t)env->regs[R_EDX]) << 32);
switch ((uint32_t)env->regs[R_ECX]) {
case MSR_IA32_SYSENTER_CS:
env->sysenter_cs = val & 0xffff;
break;
case MSR_IA32_SYSENTER_ESP:
env->sysenter_esp = val;
break;
case MSR_IA32_SYSENTER_EIP:
env->sysenter_eip = val;
break;
case MSR_IA32_APICBASE:
cpu_set_apic_base(env->apic_state, val);
break;
case MSR_EFER:
{
uint64_t update_mask;
update_mask = 0;
if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_SYSCALL) {
update_mask |= MSR_EFER_SCE;
}
if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
update_mask |= MSR_EFER_LME;
}
if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_FFXSR) {
update_mask |= MSR_EFER_FFXSR;
}
if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_NX) {
update_mask |= MSR_EFER_NXE;
}
if (env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_SVM) {
update_mask |= MSR_EFER_SVME;
}
if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_FFXSR) {
update_mask |= MSR_EFER_FFXSR;
}
cpu_load_efer(env, (env->efer & ~update_mask) |
(val & update_mask));
}
break;
case MSR_STAR:
env->star = val;
break;
case MSR_PAT:
env->pat = val;
break;
case MSR_VM_HSAVE_PA:
env->vm_hsave = val;
break;
#ifdef TARGET_X86_64
case MSR_LSTAR:
env->lstar = val;
break;
case MSR_CSTAR:
env->cstar = val;
break;
case MSR_FMASK:
env->fmask = val;
break;
case MSR_FSBASE:
env->segs[R_FS].base = val;
break;
case MSR_GSBASE:
env->segs[R_GS].base = val;
break;
case MSR_KERNELGSBASE:
env->kernelgsbase = val;
break;
#endif
case MSR_MTRRphysBase(0):
case MSR_MTRRphysBase(1):
case MSR_MTRRphysBase(2):
case MSR_MTRRphysBase(3):
case MSR_MTRRphysBase(4):
case MSR_MTRRphysBase(5):
case MSR_MTRRphysBase(6):
case MSR_MTRRphysBase(7):
env->mtrr_var[((uint32_t)env->regs[R_ECX] -
MSR_MTRRphysBase(0)) / 2].base = val;
break;
case MSR_MTRRphysMask(0):
case MSR_MTRRphysMask(1):
case MSR_MTRRphysMask(2):
case MSR_MTRRphysMask(3):
case MSR_MTRRphysMask(4):
case MSR_MTRRphysMask(5):
case MSR_MTRRphysMask(6):
case MSR_MTRRphysMask(7):
env->mtrr_var[((uint32_t)env->regs[R_ECX] -
MSR_MTRRphysMask(0)) / 2].mask = val;
break;
case MSR_MTRRfix64K_00000:
env->mtrr_fixed[(uint32_t)env->regs[R_ECX] -
MSR_MTRRfix64K_00000] = val;
break;
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
env->mtrr_fixed[(uint32_t)env->regs[R_ECX] -
MSR_MTRRfix16K_80000 + 1] = val;
break;
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
case MSR_MTRRfix4K_D8000:
case MSR_MTRRfix4K_E0000:
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
env->mtrr_fixed[(uint32_t)env->regs[R_ECX] -
MSR_MTRRfix4K_C0000 + 3] = val;
break;
case MSR_MTRRdefType:
env->mtrr_deftype = val;
break;
case MSR_MCG_STATUS:
env->mcg_status = val;
break;
case MSR_MCG_CTL:
if ((env->mcg_cap & MCG_CTL_P)
&& (val == 0 || val == ~(uint64_t)0)) {
env->mcg_ctl = val;
}
break;
case MSR_TSC_AUX:
env->tsc_aux = val;
break;
case MSR_IA32_MISC_ENABLE:
env->msr_ia32_misc_enable = val;
break;
default:
if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL
&& (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL +
(4 * env->mcg_cap & 0xff)) {
uint32_t offset = (uint32_t)env->regs[R_ECX] - MSR_MC0_CTL;
if ((offset & 0x3) != 0
|| (val == 0 || val == ~(uint64_t)0)) {
env->mce_banks[offset] = val;
}
break;
}
/* XXX: exception? */
break;
}
}
void helper_rdmsr(CPUX86State *env)
{
uint64_t val;
cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 0);
switch ((uint32_t)env->regs[R_ECX]) {
case MSR_IA32_SYSENTER_CS:
val = env->sysenter_cs;
break;
case MSR_IA32_SYSENTER_ESP:
val = env->sysenter_esp;
break;
case MSR_IA32_SYSENTER_EIP:
val = env->sysenter_eip;
break;
case MSR_IA32_APICBASE:
val = cpu_get_apic_base(env->apic_state);
break;
case MSR_EFER:
val = env->efer;
break;
case MSR_STAR:
val = env->star;
break;
case MSR_PAT:
val = env->pat;
break;
case MSR_VM_HSAVE_PA:
val = env->vm_hsave;
break;
case MSR_IA32_PERF_STATUS:
/* tsc_increment_by_tick */
val = 1000ULL;
/* CPU multiplier */
val |= (((uint64_t)4ULL) << 40);
break;
#ifdef TARGET_X86_64
case MSR_LSTAR:
val = env->lstar;
break;
case MSR_CSTAR:
val = env->cstar;
break;
case MSR_FMASK:
val = env->fmask;
break;
case MSR_FSBASE:
val = env->segs[R_FS].base;
break;
case MSR_GSBASE:
val = env->segs[R_GS].base;
break;
case MSR_KERNELGSBASE:
val = env->kernelgsbase;
break;
case MSR_TSC_AUX:
val = env->tsc_aux;
break;
#endif
case MSR_MTRRphysBase(0):
case MSR_MTRRphysBase(1):
case MSR_MTRRphysBase(2):
case MSR_MTRRphysBase(3):
case MSR_MTRRphysBase(4):
case MSR_MTRRphysBase(5):
case MSR_MTRRphysBase(6):
case MSR_MTRRphysBase(7):
val = env->mtrr_var[((uint32_t)env->regs[R_ECX] -
MSR_MTRRphysBase(0)) / 2].base;
break;
case MSR_MTRRphysMask(0):
case MSR_MTRRphysMask(1):
case MSR_MTRRphysMask(2):
case MSR_MTRRphysMask(3):
case MSR_MTRRphysMask(4):
case MSR_MTRRphysMask(5):
case MSR_MTRRphysMask(6):
case MSR_MTRRphysMask(7):
val = env->mtrr_var[((uint32_t)env->regs[R_ECX] -
MSR_MTRRphysMask(0)) / 2].mask;
break;
case MSR_MTRRfix64K_00000:
val = env->mtrr_fixed[0];
break;
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
val = env->mtrr_fixed[(uint32_t)env->regs[R_ECX] -
MSR_MTRRfix16K_80000 + 1];
break;
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
case MSR_MTRRfix4K_D8000:
case MSR_MTRRfix4K_E0000:
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
val = env->mtrr_fixed[(uint32_t)env->regs[R_ECX] -
MSR_MTRRfix4K_C0000 + 3];
break;
case MSR_MTRRdefType:
val = env->mtrr_deftype;
break;
case MSR_MTRRcap:
if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
val = MSR_MTRRcap_VCNT | MSR_MTRRcap_FIXRANGE_SUPPORT |
MSR_MTRRcap_WC_SUPPORTED;
} else {
/* XXX: exception? */
val = 0;
}
break;
case MSR_MCG_CAP:
val = env->mcg_cap;
break;
case MSR_MCG_CTL:
if (env->mcg_cap & MCG_CTL_P) {
val = env->mcg_ctl;
} else {
val = 0;
}
break;
case MSR_MCG_STATUS:
val = env->mcg_status;
break;
case MSR_IA32_MISC_ENABLE:
val = env->msr_ia32_misc_enable;
break;
default:
if ((uint32_t)env->regs[R_ECX] >= MSR_MC0_CTL
&& (uint32_t)env->regs[R_ECX] < MSR_MC0_CTL +
(4 * env->mcg_cap & 0xff)) {
uint32_t offset = (uint32_t)env->regs[R_ECX] - MSR_MC0_CTL;
val = env->mce_banks[offset];
break;
}
/* XXX: exception? */
val = 0;
break;
}
env->regs[R_EAX] = (uint32_t)(val);
env->regs[R_EDX] = (uint32_t)(val >> 32);
}
#endif
static void do_pause(X86CPU *cpu)
{
CPUX86State *env = &cpu->env;
/* Just let another CPU run. */
env->exception_index = EXCP_INTERRUPT;
cpu_loop_exit(env);
}
static void do_hlt(X86CPU *cpu)
{
CPUState *cs = CPU(cpu);
CPUX86State *env = &cpu->env;
env->hflags &= ~HF_INHIBIT_IRQ_MASK; /* needed if sti is just before */
cs->halted = 1;
env->exception_index = EXCP_HLT;
cpu_loop_exit(env);
}
void helper_hlt(CPUX86State *env, int next_eip_addend)
{
X86CPU *cpu = x86_env_get_cpu(env);
cpu_svm_check_intercept_param(env, SVM_EXIT_HLT, 0);
env->eip += next_eip_addend;
do_hlt(cpu);
}
void helper_monitor(CPUX86State *env, target_ulong ptr)
{
if ((uint32_t)env->regs[R_ECX] != 0) {
raise_exception(env, EXCP0D_GPF);
}
/* XXX: store address? */
cpu_svm_check_intercept_param(env, SVM_EXIT_MONITOR, 0);
}
void helper_mwait(CPUX86State *env, int next_eip_addend)
{
CPUState *cs;
X86CPU *cpu;
if ((uint32_t)env->regs[R_ECX] != 0) {
raise_exception(env, EXCP0D_GPF);
}
cpu_svm_check_intercept_param(env, SVM_EXIT_MWAIT, 0);
env->eip += next_eip_addend;
cpu = x86_env_get_cpu(env);
cs = CPU(cpu);
/* XXX: not complete but not completely erroneous */
if (cs->cpu_index != 0 || CPU_NEXT(cs) != NULL) {
do_pause(cpu);
} else {
do_hlt(cpu);
}
}
void helper_pause(CPUX86State *env, int next_eip_addend)
{
X86CPU *cpu = x86_env_get_cpu(env);
cpu_svm_check_intercept_param(env, SVM_EXIT_PAUSE, 0);
env->eip += next_eip_addend;
do_pause(cpu);
}
void helper_debug(CPUX86State *env)
{
env->exception_index = EXCP_DEBUG;
cpu_loop_exit(env);
}