From bdcbd3e2fb98407e6b0928a17d1ed61786cb6389 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:04 +0100 Subject: [PATCH 01/34] kvm: ppc: Fix breakage of kvm_arch_pre_run/process_irqchip_events Commit 7a39fe5882 failed to convert the right arch function. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-ppc/kvm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index bd4012a4ec..3924f4bed4 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -222,7 +222,7 @@ int kvmppc_set_interrupt(CPUState *env, int irq, int level) #define PPC_INPUT_INT PPC6xx_INPUT_INT #endif -int kvm_arch_pre_run(CPUState *env, struct kvm_run *run) +void kvm_arch_pre_run(CPUState *env, struct kvm_run *run) { int r; unsigned irq; @@ -253,15 +253,15 @@ int kvm_arch_pre_run(CPUState *env, struct kvm_run *run) /* We don't know if there are more interrupts pending after this. However, * the guest will return to userspace in the course of handling this one * anyways, so we will get a chance to deliver the rest. */ - return 0; } void kvm_arch_post_run(CPUState *env, struct kvm_run *run) { } -void kvm_arch_process_irqchip_events(CPUState *env) +int kvm_arch_process_irqchip_events(CPUState *env) { + return 0; } static int kvmppc_handle_halt(CPUState *env) From fbc1c7e6886290c72e78bac85db8e779d608b38b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:05 +0100 Subject: [PATCH 02/34] kvm: Fix build warning when KVM_CAP_SET_GUEST_DEBUG is lacking Original fix by David Gibson. 
CC: David Gibson Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kvm-all.c b/kvm-all.c index e6a7de4722..7753c8ae38 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -998,7 +998,9 @@ int kvm_cpu_exec(CPUState *env) } ret = EXCP_INTERRUPT; +#ifdef KVM_CAP_SET_GUEST_DEBUG out: +#endif env->exit_request = 0; cpu_single_env = NULL; return ret; From ac098781583bb1ca8369b5bd81fa5fbafa86231c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:06 +0100 Subject: [PATCH 03/34] x86: Account for MCE in cpu_has_work MCEs can be injected asynchronously, so they can also terminate the halt state. Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/exec.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/target-i386/exec.h b/target-i386/exec.h index 3e7386e91c..6f9f709d8a 100644 --- a/target-i386/exec.h +++ b/target-i386/exec.h @@ -293,15 +293,12 @@ static inline void load_eflags(int eflags, int update_mask) static inline int cpu_has_work(CPUState *env) { - int work; - - work = (env->interrupt_request & CPU_INTERRUPT_HARD) && - (env->eflags & IF_MASK); - work |= env->interrupt_request & CPU_INTERRUPT_NMI; - work |= env->interrupt_request & CPU_INTERRUPT_INIT; - work |= env->interrupt_request & CPU_INTERRUPT_SIPI; - - return work; + return ((env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK)) || + (env->interrupt_request & (CPU_INTERRUPT_NMI | + CPU_INTERRUPT_INIT | + CPU_INTERRUPT_SIPI | + CPU_INTERRUPT_MCE)); } /* load efer and update the corresponding hflags. XXX: do consistency From 185592324ff92f3c7c5edb4d9e96d40a137ecdcd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:07 +0100 Subject: [PATCH 04/34] x86: Perform implicit mcg_status reset Reorder mcg_status in CPUState to achieve automatic clearing on reset. 
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/cpu.h | 3 ++- target-i386/helper.c | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 5f1df8b4d3..75156e761a 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -687,6 +687,8 @@ typedef struct CPUX86State { uint64_t pat; + uint64_t mcg_status; + /* exception/interrupt handling */ int error_code; int exception_is_int; @@ -741,7 +743,6 @@ typedef struct CPUX86State { struct DeviceState *apic_state; uint64_t mcg_cap; - uint64_t mcg_status; uint64_t mcg_ctl; uint64_t mce_banks[MCE_BANKS_DEF*4]; diff --git a/target-i386/helper.c b/target-i386/helper.c index f0c546df5c..f41416f2ee 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -101,8 +101,6 @@ void cpu_reset(CPUX86State *env) env->dr[7] = DR7_FIXED_1; cpu_breakpoint_remove_all(env, BP_CPU); cpu_watchpoint_remove_all(env, BP_CPU); - - env->mcg_status = 0; } void cpu_x86_close(CPUX86State *env) From 2fa11da0c3a6614b15e6007389986a7c2b31ac49 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:08 +0100 Subject: [PATCH 05/34] x86: Small cleanups of MCE helpers Fix some code style issues, use proper headers, and align to cpu_x86 naming scheme. No functional changes. 
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- cpu-all.h | 4 ---- monitor.c | 2 +- target-i386/cpu.h | 5 +++++ target-i386/helper.c | 41 ++++++++++++++++++++++++----------------- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cpu-all.h b/cpu-all.h index 87b0f86667..caf5e6c378 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -971,8 +971,4 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf); int cpu_memory_rw_debug(CPUState *env, target_ulong addr, uint8_t *buf, int len, int is_write); -void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc, - int broadcast); - #endif /* CPU_ALL_H */ diff --git a/monitor.c b/monitor.c index 22ae3bbff0..45b0cc23d5 100644 --- a/monitor.c +++ b/monitor.c @@ -2713,7 +2713,7 @@ static void do_inject_mce(Monitor *mon, const QDict *qdict) for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) { if (cenv->cpu_index == cpu_index && cenv->mcg_cap) { - cpu_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, + cpu_x86_inject_mce(cenv, bank, status, mcg_status, addr, misc, broadcast); break; } diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 75156e761a..52bb48ec33 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -986,4 +986,9 @@ static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc, void do_cpu_init(CPUState *env); void do_cpu_sipi(CPUState *env); + +void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status, + uint64_t mcg_status, uint64_t addr, uint64_t misc, + int broadcast); + #endif /* CPU_I386_H */ diff --git a/target-i386/helper.c b/target-i386/helper.c index f41416f2ee..ba3bed91c8 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -28,6 +28,9 @@ #include "qemu-common.h" #include "kvm.h" #include "kvm_x86.h" +#ifndef CONFIG_USER_ONLY +#include "sysemu.h" +#endif //#define DEBUG_MMU @@ -1063,11 +1066,9 @@ static void 
breakpoint_handler(CPUState *env) prev_debug_excp_handler(env); } -/* This should come from sysemu.h - if we could include it here... */ -void qemu_system_reset_request(void); - -static void qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc) +static void +qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, + uint64_t mcg_status, uint64_t addr, uint64_t misc) { uint64_t mcg_cap = cenv->mcg_cap; uint64_t *banks = cenv->mce_banks; @@ -1077,15 +1078,17 @@ static void qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, * reporting is disabled */ if ((status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && - cenv->mcg_ctl != ~(uint64_t)0) + cenv->mcg_ctl != ~(uint64_t)0) { return; + } banks += 4 * bank; /* * if MSR_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank */ - if ((status & MCI_STATUS_UC) && banks[0] != ~(uint64_t)0) + if ((status & MCI_STATUS_UC) && banks[0] != ~(uint64_t)0) { return; + } if (status & MCI_STATUS_UC) { if ((cenv->mcg_status & MCG_STATUS_MCIP) || !(cenv->cr[4] & CR4_MCE_MASK)) { @@ -1095,8 +1098,9 @@ static void qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, qemu_system_reset_request(); return; } - if (banks[1] & MCI_STATUS_VAL) + if (banks[1] & MCI_STATUS_VAL) { status |= MCI_STATUS_OVER; + } banks[2] = addr; banks[3] = misc; cenv->mcg_status = mcg_status; @@ -1104,16 +1108,18 @@ static void qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, cpu_interrupt(cenv, CPU_INTERRUPT_MCE); } else if (!(banks[1] & MCI_STATUS_VAL) || !(banks[1] & MCI_STATUS_UC)) { - if (banks[1] & MCI_STATUS_VAL) + if (banks[1] & MCI_STATUS_VAL) { status |= MCI_STATUS_OVER; + } banks[2] = addr; banks[3] = misc; banks[1] = status; - } else + } else { banks[1] |= MCI_STATUS_OVER; + } } -void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, +void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status, uint64_t 
mcg_status, uint64_t addr, uint64_t misc, int broadcast) { @@ -1155,15 +1161,16 @@ void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, static void mce_init(CPUX86State *cenv) { - unsigned int bank, bank_num; + unsigned int bank; - if (((cenv->cpuid_version >> 8)&0xf) >= 6 - && (cenv->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)) { + if (((cenv->cpuid_version >> 8) & 0xf) >= 6 + && (cenv->cpuid_features & (CPUID_MCE | CPUID_MCA)) == + (CPUID_MCE | CPUID_MCA)) { cenv->mcg_cap = MCE_CAP_DEF | MCE_BANKS_DEF; cenv->mcg_ctl = ~(uint64_t)0; - bank_num = MCE_BANKS_DEF; - for (bank = 0; bank < bank_num; bank++) - cenv->mce_banks[bank*4] = ~(uint64_t)0; + for (bank = 0; bank < MCE_BANKS_DEF; bank++) { + cenv->mce_banks[bank * 4] = ~(uint64_t)0; + } } } From 316378e4d0214b45cfeaa01609aca4dabb18d78b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:09 +0100 Subject: [PATCH 06/34] x86: Refine error reporting of MCE injection services As this service is used by the human monitor, make sure that errors get reported to the right channel, and also raise the verbosity. This requires moving the Monitor typedef in qemu-common.h to resolve the include dependency.
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- monitor.c | 4 +-- qemu-common.h | 6 ++-- target-i386/cpu.h | 6 ++-- target-i386/helper.c | 77 ++++++++++++++++++++++++++------------------ 4 files changed, 53 insertions(+), 40 deletions(-) diff --git a/monitor.c b/monitor.c index 45b0cc23d5..662df7ceb4 100644 --- a/monitor.c +++ b/monitor.c @@ -2712,8 +2712,8 @@ static void do_inject_mce(Monitor *mon, const QDict *qdict) int broadcast = qdict_get_try_bool(qdict, "broadcast", 0); for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) { - if (cenv->cpu_index == cpu_index && cenv->mcg_cap) { - cpu_x86_inject_mce(cenv, bank, status, mcg_status, addr, misc, + if (cenv->cpu_index == cpu_index) { + cpu_x86_inject_mce(mon, cenv, bank, status, mcg_status, addr, misc, broadcast); break; } diff --git a/qemu-common.h b/qemu-common.h index 3fe1719d03..7a96dd14e8 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -18,6 +18,9 @@ typedef struct QEMUFile QEMUFile; typedef struct QEMUBH QEMUBH; typedef struct DeviceState DeviceState; +struct Monitor; +typedef struct Monitor Monitor; + /* we put basic includes here to avoid repeating them in device drivers */ #include #include @@ -327,9 +330,6 @@ void qemu_iovec_memset(QEMUIOVector *qiov, int c, size_t count); void qemu_iovec_memset_skip(QEMUIOVector *qiov, int c, size_t count, size_t skip); -struct Monitor; -typedef struct Monitor Monitor; - /* Convert a byte between binary and BCD. 
*/ static inline uint8_t to_bcd(uint8_t val) { diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 52bb48ec33..486af1d6e7 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -987,8 +987,8 @@ static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc, void do_cpu_init(CPUState *env); void do_cpu_sipi(CPUState *env); -void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc, - int broadcast); +void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, + uint64_t status, uint64_t mcg_status, uint64_t addr, + uint64_t misc, int broadcast); #endif /* CPU_I386_H */ diff --git a/target-i386/helper.c b/target-i386/helper.c index ba3bed91c8..462d3329db 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -30,6 +30,7 @@ #include "kvm_x86.h" #ifndef CONFIG_USER_ONLY #include "sysemu.h" +#include "monitor.h" #endif //#define DEBUG_MMU @@ -1067,33 +1068,38 @@ static void breakpoint_handler(CPUState *env) } static void -qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, +qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, uint64_t mcg_status, uint64_t addr, uint64_t misc) { uint64_t mcg_cap = cenv->mcg_cap; - uint64_t *banks = cenv->mce_banks; + uint64_t *banks = cenv->mce_banks + 4 * bank; - /* - * if MSR_MCG_CTL is not all 1s, the uncorrected error - * reporting is disabled - */ - if ((status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && - cenv->mcg_ctl != ~(uint64_t)0) { - return; - } - banks += 4 * bank; - /* - * if MSR_MCi_CTL is not all 1s, the uncorrected error - * reporting is disabled for the bank - */ - if ((status & MCI_STATUS_UC) && banks[0] != ~(uint64_t)0) { - return; - } if (status & MCI_STATUS_UC) { + /* + * if MSR_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) { + monitor_printf(mon, + "CPU %d: Uncorrected error reporting disabled\n", + 
cenv->cpu_index); + return; + } + + /* + * if MSR_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if (banks[0] != ~(uint64_t)0) { + monitor_printf(mon, "CPU %d: Uncorrected error reporting disabled " + "for bank %d\n", cenv->cpu_index, bank); + return; + } + if ((cenv->mcg_status & MCG_STATUS_MCIP) || !(cenv->cr[4] & CR4_MCE_MASK)) { - fprintf(stderr, "injects mce exception while previous " - "one is in progress!\n"); + monitor_printf(mon, "CPU %d: Previous MCE still in progress, " + "raising triple fault\n", cenv->cpu_index); qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); qemu_system_reset_request(); return; @@ -1119,23 +1125,29 @@ qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, } } -void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc, - int broadcast) +void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, + uint64_t status, uint64_t mcg_status, uint64_t addr, + uint64_t misc, int broadcast) { unsigned bank_num = cenv->mcg_cap & 0xff; CPUState *env; int flag = 0; - if (bank >= bank_num || !(status & MCI_STATUS_VAL)) { + if (!cenv->mcg_cap) { + monitor_printf(mon, "MCE injection not supported\n"); return; } - - if (broadcast) { - if (!cpu_x86_support_mca_broadcast(cenv)) { - fprintf(stderr, "Current CPU does not support broadcast\n"); - return; - } + if (bank >= bank_num) { + monitor_printf(mon, "Invalid MCE bank number\n"); + return; + } + if (!(status & MCI_STATUS_VAL)) { + monitor_printf(mon, "Invalid MCE status code\n"); + return; + } + if (broadcast && !cpu_x86_support_mca_broadcast(cenv)) { + monitor_printf(mon, "Guest CPU does not support MCA broadcast\n"); + return; } if (kvm_enabled()) { @@ -1145,13 +1157,14 @@ void cpu_x86_inject_mce(CPUState *cenv, int bank, uint64_t status, kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, flag); } else { - qemu_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc); + 
qemu_inject_x86_mce(mon, cenv, bank, status, mcg_status, addr, misc); if (broadcast) { for (env = first_cpu; env != NULL; env = env->next_cpu) { if (cenv == env) { continue; } - qemu_inject_x86_mce(env, 1, MCI_STATUS_VAL | MCI_STATUS_UC, + qemu_inject_x86_mce(mon, env, 1, + MCI_STATUS_VAL | MCI_STATUS_UC, MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0); } } From 747461c76beb1856343d5cd16bb7663ed2ebf3da Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:10 +0100 Subject: [PATCH 07/34] x86: Optionally avoid injecting AO MCEs while others are pending Allow to tell cpu_x86_inject_mce that it should ignore Action Optional MCE events when the target VCPU is still processing another one. This will be used by KVM soon. Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- monitor.c | 7 +++++-- target-i386/cpu.h | 5 ++++- target-i386/helper.c | 26 +++++++++++++++++++------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/monitor.c b/monitor.c index 662df7ceb4..ae20927c5a 100644 --- a/monitor.c +++ b/monitor.c @@ -2709,12 +2709,15 @@ static void do_inject_mce(Monitor *mon, const QDict *qdict) uint64_t mcg_status = qdict_get_int(qdict, "mcg_status"); uint64_t addr = qdict_get_int(qdict, "addr"); uint64_t misc = qdict_get_int(qdict, "misc"); - int broadcast = qdict_get_try_bool(qdict, "broadcast", 0); + int flags = MCE_INJECT_UNCOND_AO; + if (qdict_get_try_bool(qdict, "broadcast", 0)) { + flags |= MCE_INJECT_BROADCAST; + } for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) { if (cenv->cpu_index == cpu_index) { cpu_x86_inject_mce(mon, cenv, bank, status, mcg_status, addr, misc, - broadcast); + flags); break; } } diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 486af1d6e7..d0eae754cc 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -987,8 +987,11 @@ static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc, void do_cpu_init(CPUState *env); void 
do_cpu_sipi(CPUState *env); +#define MCE_INJECT_BROADCAST 1 +#define MCE_INJECT_UNCOND_AO 2 + void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, uint64_t mcg_status, uint64_t addr, - uint64_t misc, int broadcast); + uint64_t misc, int flags); #endif /* CPU_I386_H */ diff --git a/target-i386/helper.c b/target-i386/helper.c index 462d3329db..e3ef40cbca 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -1069,11 +1069,20 @@ static void breakpoint_handler(CPUState *env) static void qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc) + uint64_t mcg_status, uint64_t addr, uint64_t misc, + int flags) { uint64_t mcg_cap = cenv->mcg_cap; uint64_t *banks = cenv->mce_banks + 4 * bank; + /* + * If there is an MCE exception being processed, ignore this SRAO MCE + * unless unconditional injection was requested. + */ + if (!(flags & MCE_INJECT_UNCOND_AO) && !(status & MCI_STATUS_AR) + && (cenv->mcg_status & MCG_STATUS_MCIP)) { + return; + } if (status & MCI_STATUS_UC) { /* * if MSR_MCG_CTL is not all 1s, the uncorrected error @@ -1127,7 +1136,7 @@ qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, uint64_t mcg_status, uint64_t addr, - uint64_t misc, int broadcast) + uint64_t misc, int flags) { unsigned bank_num = cenv->mcg_cap & 0xff; CPUState *env; @@ -1145,27 +1154,30 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, monitor_printf(mon, "Invalid MCE status code\n"); return; } - if (broadcast && !cpu_x86_support_mca_broadcast(cenv)) { + if ((flags & MCE_INJECT_BROADCAST) + && !cpu_x86_support_mca_broadcast(cenv)) { monitor_printf(mon, "Guest CPU does not support MCA broadcast\n"); return; } if (kvm_enabled()) { - if (broadcast) { + if (flags & MCE_INJECT_BROADCAST) { flag |= MCE_BROADCAST; } kvm_inject_x86_mce(cenv, bank, status, 
mcg_status, addr, misc, flag); } else { - qemu_inject_x86_mce(mon, cenv, bank, status, mcg_status, addr, misc); - if (broadcast) { + qemu_inject_x86_mce(mon, cenv, bank, status, mcg_status, addr, misc, + flags); + if (flags & MCE_INJECT_BROADCAST) { for (env = first_cpu; env != NULL; env = env->next_cpu) { if (cenv == env) { continue; } qemu_inject_x86_mce(mon, env, 1, MCI_STATUS_VAL | MCI_STATUS_UC, - MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0); + MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, + flags); } } } From a7ada1510cb380b86c4811aa40fc45c8a7fc0daf Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:11 +0100 Subject: [PATCH 08/34] Synchronize VCPU states before reset This is required to support keeping VCPU states across a system reset. If we do not read the current state before the reset, cpu_synchronize_all_post_reset may write back incorrect state information. The first user of this will be MCE MSR synchronization which currently works around the missing cpu_synchronize_all_states. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- vl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vl.c b/vl.c index 5e007a764c..f4c24d30dc 100644 --- a/vl.c +++ b/vl.c @@ -1450,6 +1450,7 @@ static void main_loop(void) } if (qemu_reset_requested()) { pause_all_vcpus(); + cpu_synchronize_all_states(); qemu_system_reset(); resume_all_vcpus(); } From 419fb20a8eab0376d86790a9907553307b1654c5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:12 +0100 Subject: [PATCH 09/34] kvm: x86: Move MCE functions together Pure function shuffling to avoid multiple #ifdef KVM_CAP_MCE sections, no functional changes. While at it, annotate some #ifdef sections.
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 346 +++++++++++++++++++++++----------------------- 1 file changed, 171 insertions(+), 175 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 1f12cbf19e..aba2cc91fa 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -172,7 +172,7 @@ static int get_para_features(CPUState *env) #endif return features; } -#endif +#endif /* CONFIG_KVM_PARA */ #ifdef KVM_CAP_MCE static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap, @@ -273,8 +273,174 @@ static void kvm_inject_x86_mce_on(CPUState *env, struct kvm_x86_mce *mce, run_on_cpu(env, kvm_do_inject_x86_mce, &data); } -static void kvm_mce_broadcast_rest(CPUState *env); -#endif +static void kvm_mce_broadcast_rest(CPUState *env) +{ + struct kvm_x86_mce mce = { + .bank = 1, + .status = MCI_STATUS_VAL | MCI_STATUS_UC, + .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, + .addr = 0, + .misc = 0, + }; + CPUState *cenv; + + /* Broadcast MCA signal for processor version 06H_EH and above */ + if (cpu_x86_support_mca_broadcast(env)) { + for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) { + if (cenv == env) { + continue; + } + kvm_inject_x86_mce_on(cenv, &mce, ABORT_ON_ERROR); + } + } +} + +static void kvm_mce_inj_srar_dataload(CPUState *env, target_phys_addr_t paddr) +{ + struct kvm_x86_mce mce = { + .bank = 9, + .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S + | MCI_STATUS_AR | 0x134, + .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV, + .addr = paddr, + .misc = (MCM_ADDR_PHYS << 6) | 0xc, + }; + int r; + + r = kvm_set_mce(env, &mce); + if (r < 0) { + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); + abort(); + } + kvm_mce_broadcast_rest(env); +} + +static void kvm_mce_inj_srao_memscrub(CPUState *env, target_phys_addr_t paddr) +{ + struct kvm_x86_mce mce = { + .bank = 9, + .status = 
MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S + | 0xc0, + .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, + .addr = paddr, + .misc = (MCM_ADDR_PHYS << 6) | 0xc, + }; + int r; + + r = kvm_set_mce(env, &mce); + if (r < 0) { + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); + abort(); + } + kvm_mce_broadcast_rest(env); +} + +static void kvm_mce_inj_srao_memscrub2(CPUState *env, target_phys_addr_t paddr) +{ + struct kvm_x86_mce mce = { + .bank = 9, + .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S + | 0xc0, + .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, + .addr = paddr, + .misc = (MCM_ADDR_PHYS << 6) | 0xc, + }; + + kvm_inject_x86_mce_on(env, &mce, ABORT_ON_ERROR); + kvm_mce_broadcast_rest(env); +} +#endif /* KVM_CAP_MCE */ + +static void hardware_memory_error(void) +{ + fprintf(stderr, "Hardware memory error!\n"); + exit(1); +} + +int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void *addr) +{ +#ifdef KVM_CAP_MCE + void *vaddr; + ram_addr_t ram_addr; + target_phys_addr_t paddr; + + if ((env->mcg_cap & MCG_SER_P) && addr + && (code == BUS_MCEERR_AR + || code == BUS_MCEERR_AO)) { + vaddr = (void *)addr; + if (qemu_ram_addr_from_host(vaddr, &ram_addr) || + !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) { + fprintf(stderr, "Hardware memory error for memory used by " + "QEMU itself instead of guest system!\n"); + /* Hope we are lucky for AO MCE */ + if (code == BUS_MCEERR_AO) { + return 0; + } else { + hardware_memory_error(); + } + } + + if (code == BUS_MCEERR_AR) { + /* Fake an Intel architectural Data Load SRAR UCR */ + kvm_mce_inj_srar_dataload(env, paddr); + } else { + /* + * If there is an MCE excpetion being processed, ignore + * this SRAO MCE + */ + if (!kvm_mce_in_progress(env)) { + /* Fake an Intel architectural Memory scrubbing UCR */ + kvm_mce_inj_srao_memscrub(env, paddr); + } + } + } else +#endif /* 
KVM_CAP_MCE */ + { + if (code == BUS_MCEERR_AO) { + return 0; + } else if (code == BUS_MCEERR_AR) { + hardware_memory_error(); + } else { + return 1; + } + } + return 0; +} + +int kvm_arch_on_sigbus(int code, void *addr) +{ +#ifdef KVM_CAP_MCE + if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) { + void *vaddr; + ram_addr_t ram_addr; + target_phys_addr_t paddr; + + /* Hope we are lucky for AO MCE */ + vaddr = addr; + if (qemu_ram_addr_from_host(vaddr, &ram_addr) || + !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, + &paddr)) { + fprintf(stderr, "Hardware memory error for memory used by " + "QEMU itself instead of guest system!: %p\n", addr); + return 0; + } + kvm_mce_inj_srao_memscrub2(first_cpu, paddr); + } else +#endif /* KVM_CAP_MCE */ + { + if (code == BUS_MCEERR_AO) { + return 0; + } else if (code == BUS_MCEERR_AR) { + hardware_memory_error(); + } else { + return 1; + } + } + return 0; +} void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, uint64_t mcg_status, uint64_t addr, uint64_t misc, @@ -294,11 +460,11 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, } kvm_inject_x86_mce_on(cenv, &mce, flag); -#else +#else /* !KVM_CAP_MCE*/ if (flag & ABORT_ON_ERROR) { abort(); } -#endif +#endif /* !KVM_CAP_MCE*/ } static void cpu_update_state(void *opaque, int running, int reason) @@ -1783,173 +1949,3 @@ bool kvm_arch_stop_on_emulation_error(CPUState *env) return !(env->cr[0] & CR0_PE_MASK) || ((env->segs[R_CS].selector & 3) != 3); } - -static void hardware_memory_error(void) -{ - fprintf(stderr, "Hardware memory error!\n"); - exit(1); -} - -#ifdef KVM_CAP_MCE -static void kvm_mce_broadcast_rest(CPUState *env) -{ - struct kvm_x86_mce mce = { - .bank = 1, - .status = MCI_STATUS_VAL | MCI_STATUS_UC, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, - .addr = 0, - .misc = 0, - }; - CPUState *cenv; - - /* Broadcast MCA signal for processor version 06H_EH and above */ - if 
(cpu_x86_support_mca_broadcast(env)) { - for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) { - if (cenv == env) { - continue; - } - kvm_inject_x86_mce_on(cenv, &mce, ABORT_ON_ERROR); - } - } -} - -static void kvm_mce_inj_srar_dataload(CPUState *env, target_phys_addr_t paddr) -{ - struct kvm_x86_mce mce = { - .bank = 9, - .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN - | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S - | MCI_STATUS_AR | 0x134, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV, - .addr = paddr, - .misc = (MCM_ADDR_PHYS << 6) | 0xc, - }; - int r; - - r = kvm_set_mce(env, &mce); - if (r < 0) { - fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); - abort(); - } - kvm_mce_broadcast_rest(env); -} - -static void kvm_mce_inj_srao_memscrub(CPUState *env, target_phys_addr_t paddr) -{ - struct kvm_x86_mce mce = { - .bank = 9, - .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN - | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S - | 0xc0, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, - .addr = paddr, - .misc = (MCM_ADDR_PHYS << 6) | 0xc, - }; - int r; - - r = kvm_set_mce(env, &mce); - if (r < 0) { - fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); - abort(); - } - kvm_mce_broadcast_rest(env); -} - -static void kvm_mce_inj_srao_memscrub2(CPUState *env, target_phys_addr_t paddr) -{ - struct kvm_x86_mce mce = { - .bank = 9, - .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN - | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S - | 0xc0, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, - .addr = paddr, - .misc = (MCM_ADDR_PHYS << 6) | 0xc, - }; - - kvm_inject_x86_mce_on(env, &mce, ABORT_ON_ERROR); - kvm_mce_broadcast_rest(env); -} - -#endif - -int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void *addr) -{ -#if defined(KVM_CAP_MCE) - void *vaddr; - ram_addr_t ram_addr; - target_phys_addr_t paddr; - - if ((env->mcg_cap & MCG_SER_P) && addr - && (code == BUS_MCEERR_AR - || code == BUS_MCEERR_AO)) { - 
vaddr = (void *)addr; - if (qemu_ram_addr_from_host(vaddr, &ram_addr) || - !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) { - fprintf(stderr, "Hardware memory error for memory used by " - "QEMU itself instead of guest system!\n"); - /* Hope we are lucky for AO MCE */ - if (code == BUS_MCEERR_AO) { - return 0; - } else { - hardware_memory_error(); - } - } - - if (code == BUS_MCEERR_AR) { - /* Fake an Intel architectural Data Load SRAR UCR */ - kvm_mce_inj_srar_dataload(env, paddr); - } else { - /* - * If there is an MCE excpetion being processed, ignore - * this SRAO MCE - */ - if (!kvm_mce_in_progress(env)) { - /* Fake an Intel architectural Memory scrubbing UCR */ - kvm_mce_inj_srao_memscrub(env, paddr); - } - } - } else -#endif - { - if (code == BUS_MCEERR_AO) { - return 0; - } else if (code == BUS_MCEERR_AR) { - hardware_memory_error(); - } else { - return 1; - } - } - return 0; -} - -int kvm_arch_on_sigbus(int code, void *addr) -{ -#if defined(KVM_CAP_MCE) - if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) { - void *vaddr; - ram_addr_t ram_addr; - target_phys_addr_t paddr; - - /* Hope we are lucky for AO MCE */ - vaddr = addr; - if (qemu_ram_addr_from_host(vaddr, &ram_addr) || - !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) { - fprintf(stderr, "Hardware memory error for memory used by " - "QEMU itself instead of guest system!: %p\n", addr); - return 0; - } - kvm_mce_inj_srao_memscrub2(first_cpu, paddr); - } else -#endif - { - if (code == BUS_MCEERR_AO) { - return 0; - } else if (code == BUS_MCEERR_AR) { - hardware_memory_error(); - } else { - return 1; - } - } - return 0; -} From 990368650f77d6f2b428b034209331ada7cb463f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:13 +0100 Subject: [PATCH 10/34] kvm: Rename kvm_arch_process_irqchip_events to async_events We will broaden the scope of this function on x86 beyond irqchip events. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 2 +- kvm.h | 2 +- target-i386/kvm.c | 2 +- target-ppc/kvm.c | 2 +- target-s390x/kvm.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 7753c8ae38..226843c65f 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -893,7 +893,7 @@ int kvm_cpu_exec(CPUState *env) DPRINTF("kvm_cpu_exec()\n"); - if (kvm_arch_process_irqchip_events(env)) { + if (kvm_arch_process_async_events(env)) { env->exit_request = 0; return EXCP_HLT; } diff --git a/kvm.h b/kvm.h index 59b2c29fd9..7bc04e0f14 100644 --- a/kvm.h +++ b/kvm.h @@ -102,7 +102,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run); int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run); -int kvm_arch_process_irqchip_events(CPUState *env); +int kvm_arch_process_async_events(CPUState *env); int kvm_arch_get_registers(CPUState *env); diff --git a/target-i386/kvm.c b/target-i386/kvm.c index aba2cc91fa..0aef810418 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1675,7 +1675,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run) cpu_set_apic_base(env->apic_state, run->apic_base); } -int kvm_arch_process_irqchip_events(CPUState *env) +int kvm_arch_process_async_events(CPUState *env) { if (kvm_irqchip_in_kernel()) { return 0; diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 3924f4bed4..6c99a163b8 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -259,7 +259,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run) { } -int kvm_arch_process_irqchip_events(CPUState *env) +int kvm_arch_process_async_events(CPUState *env) { return 0; } diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 6e9427431d..a85ae0fc11 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -177,7 +177,7 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run) { } -int kvm_arch_process_irqchip_events(CPUState *env) +int kvm_arch_process_async_events(CPUState *env) { return 0; } From 
ab443475c9235822e329e1bfde89be6c71e2c21e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:14 +0100 Subject: [PATCH 11/34] kvm: x86: Inject pending MCE events on state writeback The current way of injecting MCE events without updating of and synchronizing with the CPUState is broken and causes spurious corruptions of the MCE-related parts of the CPUState. As a first step towards a fix, enhance the state writeback code with support for injecting events that are pending in the CPUState. A pending exception will then be signaled via cpu_interrupt(CPU_INTERRUPT_MCE). And, just like for TCG, we need to leave the halt state when CPU_INTERRUPT_MCE is pending (left broken for the to-be-removed old KVM code). This will also allow to unify TCG and KVM injection code. Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 0aef810418..d9a6fc501b 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -467,6 +467,38 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, #endif /* !KVM_CAP_MCE*/ } +static int kvm_inject_mce_oldstyle(CPUState *env) +{ +#ifdef KVM_CAP_MCE + if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) { + unsigned int bank, bank_num = env->mcg_cap & 0xff; + struct kvm_x86_mce mce; + + env->exception_injected = -1; + + /* + * There must be at least one bank in use if an MCE is pending. + * Find it and use its values for the event injection. 
+ */ + for (bank = 0; bank < bank_num; bank++) { + if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) { + break; + } + } + assert(bank < bank_num); + + mce.bank = bank; + mce.status = env->mce_banks[bank * 4 + 1]; + mce.mcg_status = env->mcg_status; + mce.addr = env->mce_banks[bank * 4 + 2]; + mce.misc = env->mce_banks[bank * 4 + 3]; + + return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, &mce); + } +#endif /* KVM_CAP_MCE */ + return 0; +} + static void cpu_update_state(void *opaque, int running, int reason) { CPUState *env = opaque; @@ -1539,6 +1571,11 @@ int kvm_arch_put_registers(CPUState *env, int level) if (ret < 0) { return ret; } + /* must be before kvm_put_msrs */ + ret = kvm_inject_mce_oldstyle(env); + if (ret < 0) { + return ret; + } ret = kvm_put_msrs(env, level); if (ret < 0) { return ret; @@ -1677,6 +1714,29 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run) int kvm_arch_process_async_events(CPUState *env) { + if (env->interrupt_request & CPU_INTERRUPT_MCE) { + /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */ + assert(env->mcg_cap); + + env->interrupt_request &= ~CPU_INTERRUPT_MCE; + + kvm_cpu_synchronize_state(env); + + if (env->exception_injected == EXCP08_DBLE) { + /* this means triple fault */ + qemu_system_reset_request(); + env->exit_request = 1; + return 0; + } + env->exception_injected = EXCP12_MCHK; + env->has_error_code = 0; + + env->halted = 0; + if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) { + env->mp_state = KVM_MP_STATE_RUNNABLE; + } + } + if (kvm_irqchip_in_kernel()) { return 0; } From d5bfda334adf9af62df5709cdac38f523f815f47 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:15 +0100 Subject: [PATCH 12/34] x86: Run qemu_inject_x86_mce on target VCPU We will use the current TCG-only MCE injection path for KVM as well, and then this read-modify-write of the target VCPU state has to be performed synchronously in the corresponding thread. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/helper.c | 87 +++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/target-i386/helper.c b/target-i386/helper.c index e3ef40cbca..a32960c4ce 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -1067,29 +1067,42 @@ static void breakpoint_handler(CPUState *env) prev_debug_excp_handler(env); } -static void -qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc, - int flags) +typedef struct MCEInjectionParams { + Monitor *mon; + CPUState *env; + int bank; + uint64_t status; + uint64_t mcg_status; + uint64_t addr; + uint64_t misc; + int flags; +} MCEInjectionParams; + +static void do_inject_x86_mce(void *data) { - uint64_t mcg_cap = cenv->mcg_cap; - uint64_t *banks = cenv->mce_banks + 4 * bank; + MCEInjectionParams *params = data; + CPUState *cenv = params->env; + uint64_t *banks = cenv->mce_banks + 4 * params->bank; + + cpu_synchronize_state(cenv); /* * If there is an MCE exception being processed, ignore this SRAO MCE * unless unconditional injection was requested. 
*/ - if (!(flags & MCE_INJECT_UNCOND_AO) && !(status & MCI_STATUS_AR) + if (!(params->flags & MCE_INJECT_UNCOND_AO) + && !(params->status & MCI_STATUS_AR) && (cenv->mcg_status & MCG_STATUS_MCIP)) { return; } - if (status & MCI_STATUS_UC) { + + if (params->status & MCI_STATUS_UC) { /* * if MSR_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled */ - if ((mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) { - monitor_printf(mon, + if ((cenv->mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) { + monitor_printf(params->mon, "CPU %d: Uncorrected error reporting disabled\n", cenv->cpu_index); return; @@ -1100,35 +1113,39 @@ qemu_inject_x86_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, * reporting is disabled for the bank */ if (banks[0] != ~(uint64_t)0) { - monitor_printf(mon, "CPU %d: Uncorrected error reporting disabled " - "for bank %d\n", cenv->cpu_index, bank); + monitor_printf(params->mon, + "CPU %d: Uncorrected error reporting disabled for" + " bank %d\n", + cenv->cpu_index, params->bank); return; } if ((cenv->mcg_status & MCG_STATUS_MCIP) || !(cenv->cr[4] & CR4_MCE_MASK)) { - monitor_printf(mon, "CPU %d: Previous MCE still in progress, " - "raising triple fault\n", cenv->cpu_index); + monitor_printf(params->mon, + "CPU %d: Previous MCE still in progress, raising" + " triple fault\n", + cenv->cpu_index); qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); qemu_system_reset_request(); return; } if (banks[1] & MCI_STATUS_VAL) { - status |= MCI_STATUS_OVER; + params->status |= MCI_STATUS_OVER; } - banks[2] = addr; - banks[3] = misc; - cenv->mcg_status = mcg_status; - banks[1] = status; + banks[2] = params->addr; + banks[3] = params->misc; + cenv->mcg_status = params->mcg_status; + banks[1] = params->status; cpu_interrupt(cenv, CPU_INTERRUPT_MCE); } else if (!(banks[1] & MCI_STATUS_VAL) || !(banks[1] & MCI_STATUS_UC)) { if (banks[1] & MCI_STATUS_VAL) { - status |= MCI_STATUS_OVER; + params->status |= MCI_STATUS_OVER; } - banks[2] = 
addr; - banks[3] = misc; - banks[1] = status; + banks[2] = params->addr; + banks[3] = params->misc; + banks[1] = params->status; } else { banks[1] |= MCI_STATUS_OVER; } @@ -1138,6 +1155,16 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, uint64_t status, uint64_t mcg_status, uint64_t addr, uint64_t misc, int flags) { + MCEInjectionParams params = { + .mon = mon, + .env = cenv, + .bank = bank, + .status = status, + .mcg_status = mcg_status, + .addr = addr, + .misc = misc, + .flags = flags, + }; unsigned bank_num = cenv->mcg_cap & 0xff; CPUState *env; int flag = 0; @@ -1167,17 +1194,19 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, flag); } else { - qemu_inject_x86_mce(mon, cenv, bank, status, mcg_status, addr, misc, - flags); + run_on_cpu(cenv, do_inject_x86_mce, &params); if (flags & MCE_INJECT_BROADCAST) { + params.bank = 1; + params.status = MCI_STATUS_VAL | MCI_STATUS_UC; + params.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; + params.addr = 0; + params.misc = 0; for (env = first_cpu; env != NULL; env = env->next_cpu) { if (cenv == env) { continue; } - qemu_inject_x86_mce(mon, env, 1, - MCI_STATUS_VAL | MCI_STATUS_UC, - MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, - flags); + params.env = env; + run_on_cpu(cenv, do_inject_x86_mce, &params); } } } From c34d440a728fd3b5099d11dec122d440ef092c23 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:16 +0100 Subject: [PATCH 13/34] kvm: x86: Consolidate TCG and KVM MCE injection code This switches KVM's MCE injection path to cpu_x86_inject_mce, both for SIGBUS and monitor initiated events. This means we prepare the MCA MSRs in the VCPUState also for KVM. We have to drop the MSRs writeback restrictions for this purpose which is now safe as every uncoordinated MSR injection is removed with this patch.
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/helper.c | 34 +++--- target-i386/kvm.c | 238 +++++------------------------------------- target-i386/kvm_x86.h | 25 ----- 3 files changed, 37 insertions(+), 260 deletions(-) delete mode 100644 target-i386/kvm_x86.h diff --git a/target-i386/helper.c b/target-i386/helper.c index a32960c4ce..a08309f97a 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -27,7 +27,6 @@ #include "exec-all.h" #include "qemu-common.h" #include "kvm.h" -#include "kvm_x86.h" #ifndef CONFIG_USER_ONLY #include "sysemu.h" #include "monitor.h" @@ -1167,7 +1166,6 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, }; unsigned bank_num = cenv->mcg_cap & 0xff; CPUState *env; - int flag = 0; if (!cenv->mcg_cap) { monitor_printf(mon, "MCE injection not supported\n"); @@ -1187,27 +1185,19 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank, return; } - if (kvm_enabled()) { - if (flags & MCE_INJECT_BROADCAST) { - flag |= MCE_BROADCAST; - } - - kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, flag); - } else { - run_on_cpu(cenv, do_inject_x86_mce, &params); - if (flags & MCE_INJECT_BROADCAST) { - params.bank = 1; - params.status = MCI_STATUS_VAL | MCI_STATUS_UC; - params.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; - params.addr = 0; - params.misc = 0; - for (env = first_cpu; env != NULL; env = env->next_cpu) { - if (cenv == env) { - continue; - } - params.env = env; - run_on_cpu(cenv, do_inject_x86_mce, &params); + run_on_cpu(cenv, do_inject_x86_mce, &params); + if (flags & MCE_INJECT_BROADCAST) { + params.bank = 1; + params.status = MCI_STATUS_VAL | MCI_STATUS_UC; + params.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; + params.addr = 0; + params.misc = 0; + for (env = first_cpu; env != NULL; env = env->next_cpu) { + if (cenv == env) { + continue; } + params.env = env; + run_on_cpu(cenv, do_inject_x86_mce, &params); } } } diff --git
a/target-i386/kvm.c b/target-i386/kvm.c index d9a6fc501b..4974fa42d2 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -28,7 +28,6 @@ #include "hw/pc.h" #include "hw/apic.h" #include "ioport.h" -#include "kvm_x86.h" #ifdef CONFIG_KVM_PARA #include <linux/kvm_para.h> @@ -193,164 +192,23 @@ static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap) return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap); } -static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m) +static void kvm_mce_inject(CPUState *env, target_phys_addr_t paddr, int code) { - return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m); -} + uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S; + uint64_t mcg_status = MCG_STATUS_MCIP; -static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n) -{ - struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs); - int r; - - kmsrs->nmsrs = n; - memcpy(kmsrs->entries, msrs, n * sizeof *msrs); - r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs); - memcpy(msrs, kmsrs->entries, n * sizeof *msrs); - free(kmsrs); - return r; -} - -/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */ -static int kvm_mce_in_progress(CPUState *env) -{ - struct kvm_msr_entry msr_mcg_status = { - .index = MSR_MCG_STATUS, - }; - int r; - - r = kvm_get_msr(env, &msr_mcg_status, 1); - if (r == -1 || r == 0) { - fprintf(stderr, "Failed to get MCE status\n"); - return 0; + if (code == BUS_MCEERR_AR) { + status |= MCI_STATUS_AR | 0x134; + mcg_status |= MCG_STATUS_EIPV; + } else { + status |= 0xc0; + mcg_status |= MCG_STATUS_RIPV; } - return !!(msr_mcg_status.data & MCG_STATUS_MCIP); -} - -struct kvm_x86_mce_data -{ - CPUState *env; - struct kvm_x86_mce *mce; - int abort_on_error; -}; - -static void kvm_do_inject_x86_mce(void *_data) -{ - struct kvm_x86_mce_data *data = _data; - int r; - - /* If there is an MCE exception being processed, ignore this SRAO MCE */ - if ((data->env->mcg_cap & MCG_SER_P) && - 
!(data->mce->status & MCI_STATUS_AR)) { - if (kvm_mce_in_progress(data->env)) { - return; - } - } - - r = kvm_set_mce(data->env, data->mce); - if (r < 0) { - perror("kvm_set_mce FAILED"); - if (data->abort_on_error) { - abort(); - } - } -} - -static void kvm_inject_x86_mce_on(CPUState *env, struct kvm_x86_mce *mce, - int flag) -{ - struct kvm_x86_mce_data data = { - .env = env, - .mce = mce, - .abort_on_error = (flag & ABORT_ON_ERROR), - }; - - if (!env->mcg_cap) { - fprintf(stderr, "MCE support is not enabled!\n"); - return; - } - - run_on_cpu(env, kvm_do_inject_x86_mce, &data); -} - -static void kvm_mce_broadcast_rest(CPUState *env) -{ - struct kvm_x86_mce mce = { - .bank = 1, - .status = MCI_STATUS_VAL | MCI_STATUS_UC, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, - .addr = 0, - .misc = 0, - }; - CPUState *cenv; - - /* Broadcast MCA signal for processor version 06H_EH and above */ - if (cpu_x86_support_mca_broadcast(env)) { - for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) { - if (cenv == env) { - continue; - } - kvm_inject_x86_mce_on(cenv, &mce, ABORT_ON_ERROR); - } - } -} - -static void kvm_mce_inj_srar_dataload(CPUState *env, target_phys_addr_t paddr) -{ - struct kvm_x86_mce mce = { - .bank = 9, - .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN - | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S - | MCI_STATUS_AR | 0x134, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV, - .addr = paddr, - .misc = (MCM_ADDR_PHYS << 6) | 0xc, - }; - int r; - - r = kvm_set_mce(env, &mce); - if (r < 0) { - fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); - abort(); - } - kvm_mce_broadcast_rest(env); -} - -static void kvm_mce_inj_srao_memscrub(CPUState *env, target_phys_addr_t paddr) -{ - struct kvm_x86_mce mce = { - .bank = 9, - .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN - | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S - | 0xc0, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, - .addr = paddr, - .misc = (MCM_ADDR_PHYS << 
6) | 0xc, - }; - int r; - - r = kvm_set_mce(env, &mce); - if (r < 0) { - fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); - abort(); - } - kvm_mce_broadcast_rest(env); -} - -static void kvm_mce_inj_srao_memscrub2(CPUState *env, target_phys_addr_t paddr) -{ - struct kvm_x86_mce mce = { - .bank = 9, - .status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN - | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S - | 0xc0, - .mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV, - .addr = paddr, - .misc = (MCM_ADDR_PHYS << 6) | 0xc, - }; - - kvm_inject_x86_mce_on(env, &mce, ABORT_ON_ERROR); - kvm_mce_broadcast_rest(env); + cpu_x86_inject_mce(NULL, env, 9, status, mcg_status, paddr, + (MCM_ADDR_PHYS << 6) | 0xc, + cpu_x86_support_mca_broadcast(env) ? + MCE_INJECT_BROADCAST : 0); } #endif /* KVM_CAP_MCE */ @@ -363,16 +221,14 @@ static void hardware_memory_error(void) int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void *addr) { #ifdef KVM_CAP_MCE - void *vaddr; ram_addr_t ram_addr; target_phys_addr_t paddr; if ((env->mcg_cap & MCG_SER_P) && addr - && (code == BUS_MCEERR_AR - || code == BUS_MCEERR_AO)) { - vaddr = (void *)addr; - if (qemu_ram_addr_from_host(vaddr, &ram_addr) || - !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) { + && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) { + if (qemu_ram_addr_from_host(addr, &ram_addr) || + !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, + &paddr)) { fprintf(stderr, "Hardware memory error for memory used by " "QEMU itself instead of guest system!\n"); /* Hope we are lucky for AO MCE */ @@ -382,20 +238,7 @@ int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void *addr) hardware_memory_error(); } } - - if (code == BUS_MCEERR_AR) { - /* Fake an Intel architectural Data Load SRAR UCR */ - kvm_mce_inj_srar_dataload(env, paddr); - } else { - /* - * If there is an MCE excpetion being processed, ignore - * this SRAO MCE - */ - if (!kvm_mce_in_progress(env)) { - /* Fake an Intel architectural 
Memory scrubbing UCR */ - kvm_mce_inj_srao_memscrub(env, paddr); - } - } + kvm_mce_inject(env, paddr, code); } else #endif /* KVM_CAP_MCE */ { @@ -414,20 +257,18 @@ int kvm_arch_on_sigbus(int code, void *addr) { #ifdef KVM_CAP_MCE if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) { - void *vaddr; ram_addr_t ram_addr; target_phys_addr_t paddr; /* Hope we are lucky for AO MCE */ - vaddr = addr; - if (qemu_ram_addr_from_host(vaddr, &ram_addr) || + if (qemu_ram_addr_from_host(addr, &ram_addr) || !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) { fprintf(stderr, "Hardware memory error for memory used by " "QEMU itself instead of guest system!: %p\n", addr); return 0; } - kvm_mce_inj_srao_memscrub2(first_cpu, paddr); + kvm_mce_inject(first_cpu, paddr, code); } else #endif /* KVM_CAP_MCE */ { @@ -442,31 +283,6 @@ int kvm_arch_on_sigbus(int code, void *addr) return 0; } -void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc, - int flag) -{ -#ifdef KVM_CAP_MCE - struct kvm_x86_mce mce = { - .bank = bank, - .status = status, - .mcg_status = mcg_status, - .addr = addr, - .misc = misc, - }; - - if (flag & MCE_BROADCAST) { - kvm_mce_broadcast_rest(cenv); - } - - kvm_inject_x86_mce_on(cenv, &mce, flag); -#else /* !KVM_CAP_MCE*/ - if (flag & ABORT_ON_ERROR) { - abort(); - } -#endif /* !KVM_CAP_MCE*/ -} - static int kvm_inject_mce_oldstyle(CPUState *env) { #ifdef KVM_CAP_MCE @@ -1053,14 +869,10 @@ static int kvm_put_msrs(CPUState *env, int level) if (env->mcg_cap) { int i; - if (level == KVM_PUT_RESET_STATE) { - kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status); - } else if (level == KVM_PUT_FULL_STATE) { - kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status); - kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl); - for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { - kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]); - } + 
kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status); + kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl); + for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { + kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]); } } #endif diff --git a/target-i386/kvm_x86.h b/target-i386/kvm_x86.h deleted file mode 100644 index 9d7b584267..0000000000 --- a/target-i386/kvm_x86.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * QEMU KVM support - * - * Copyright (C) 2009 Red Hat Inc. - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#ifndef __KVM_X86_H__ -#define __KVM_X86_H__ - -#define ABORT_ON_ERROR 0x01 -#define MCE_BROADCAST 0x02 - -void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, - uint64_t mcg_status, uint64_t addr, uint64_t misc, - int flag); - -#endif From 32a420243cd1d6d83bd3a248eebeb9a5ebb5101e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:17 +0100 Subject: [PATCH 14/34] kvm: x86: Clean up kvm_setup_mce There is nothing to abstract here. Fold kvm_setup_mce into its caller and fix up the error reporting (return code of kvm_vcpu_ioctl holds the error value). 
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 4974fa42d2..9d71b20b80 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -187,11 +187,6 @@ static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap, return -ENOSYS; } -static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap) -{ - return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap); -} - static void kvm_mce_inject(CPUState *env, target_phys_addr_t paddr, int code) { uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | @@ -440,6 +435,7 @@ int kvm_arch_init_vcpu(CPUState *env) && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) { uint64_t mcg_cap; int banks; + int ret; if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks)) { perror("kvm_get_mce_cap_supported FAILED"); @@ -448,8 +444,9 @@ int kvm_arch_init_vcpu(CPUState *env) banks = MCE_BANKS_DEF; mcg_cap &= MCE_CAP_DEF; mcg_cap |= banks; - if (kvm_setup_mce(env, &mcg_cap)) { - perror("kvm_setup_mce FAILED"); + ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, &mcg_cap); + if (ret < 0) { + fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret)); } else { env->mcg_cap = mcg_cap; } From 75d49497332361a574d9ed1f546d36de385d238f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 2 Mar 2011 08:56:18 +0100 Subject: [PATCH 15/34] kvm: x86: Fail kvm_arch_init_vcpu if MCE initialization fails There is no reason to continue if the kernel claims to support MCE but then fails to process our request. 
Signed-off-by: Jan Kiszka CC: Huang Ying CC: Hidetoshi Seto CC: Jin Dongming Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 9d71b20b80..15741861a1 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -437,20 +437,24 @@ int kvm_arch_init_vcpu(CPUState *env) int banks; int ret; - if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks)) { - perror("kvm_get_mce_cap_supported FAILED"); - } else { - if (banks > MCE_BANKS_DEF) - banks = MCE_BANKS_DEF; - mcg_cap &= MCE_CAP_DEF; - mcg_cap |= banks; - ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, &mcg_cap); - if (ret < 0) { - fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret)); - } else { - env->mcg_cap = mcg_cap; - } + ret = kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks); + if (ret < 0) { + fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret)); + return ret; } + + if (banks > MCE_BANKS_DEF) { + banks = MCE_BANKS_DEF; + } + mcg_cap &= MCE_CAP_DEF; + mcg_cap |= banks; + ret = kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, &mcg_cap); + if (ret < 0) { + fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret)); + return ret; + } + + env->mcg_cap = mcg_cap; } #endif From cd19cfa23609dc1a35dd34f0b7554a8462337fde Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 2 Mar 2011 08:56:19 +0100 Subject: [PATCH 16/34] Add qemu_ram_remap qemu_ram_remap() unmaps the specified RAM pages, then re-maps these pages again. This is used by KVM HWPoison support to clear HWPoisoned page tables across guest rebooting, so that a new page may be allocated later to recover the memory error. 
[ Jan: style fixlets, WIN32 fix ] Signed-off-by: Huang Ying Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- cpu-all.h | 4 ++++ cpu-common.h | 1 + exec.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/cpu-all.h b/cpu-all.h index caf5e6c378..4f4631d79c 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -863,10 +863,14 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr); extern int phys_ram_fd; extern ram_addr_t ram_size; +/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ +#define RAM_PREALLOC_MASK (1 << 0) + typedef struct RAMBlock { uint8_t *host; ram_addr_t offset; ram_addr_t length; + uint32_t flags; char idstr[256]; QLIST_ENTRY(RAMBlock) next; #if defined(__linux__) && !defined(TARGET_S390X) diff --git a/cpu-common.h b/cpu-common.h index 54d21d4717..ef4e8dab7a 100644 --- a/cpu-common.h +++ b/cpu-common.h @@ -50,6 +50,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, ram_addr_t size, void *host); ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size); void qemu_ram_free(ram_addr_t addr); +void qemu_ram_remap(ram_addr_t addr, ram_addr_t length); /* This should only be used for ram local to a device. 
*/ void *qemu_get_ram_ptr(ram_addr_t addr); /* Same but slower, to use for migration, where the order of diff --git a/exec.c b/exec.c index 3d78063437..723ace4840 100644 --- a/exec.c +++ b/exec.c @@ -2867,6 +2867,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name, if (host) { new_block->host = host; + new_block->flags |= RAM_PREALLOC_MASK; } else { if (mem_path) { #if defined (__linux__) && !defined(TARGET_S390X) @@ -2920,7 +2921,9 @@ void qemu_ram_free(ram_addr_t addr) QLIST_FOREACH(block, &ram_list.blocks, next) { if (addr == block->offset) { QLIST_REMOVE(block, next); - if (mem_path) { + if (block->flags & RAM_PREALLOC_MASK) { + ; + } else if (mem_path) { #if defined (__linux__) && !defined(TARGET_S390X) if (block->fd) { munmap(block->host, block->length); @@ -2943,6 +2946,64 @@ void qemu_ram_free(ram_addr_t addr) } +#ifndef _WIN32 +void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) +{ + RAMBlock *block; + ram_addr_t offset; + int flags; + void *area, *vaddr; + + QLIST_FOREACH(block, &ram_list.blocks, next) { + offset = addr - block->offset; + if (offset < block->length) { + vaddr = block->host + offset; + if (block->flags & RAM_PREALLOC_MASK) { + ; + } else { + flags = MAP_FIXED; + munmap(vaddr, length); + if (mem_path) { +#if defined(__linux__) && !defined(TARGET_S390X) + if (block->fd) { +#ifdef MAP_POPULATE + flags |= mem_prealloc ? 
MAP_POPULATE | MAP_SHARED : + MAP_PRIVATE; +#else + flags |= MAP_PRIVATE; +#endif + area = mmap(vaddr, length, PROT_READ | PROT_WRITE, + flags, block->fd, offset); + } else { + flags |= MAP_PRIVATE | MAP_ANONYMOUS; + area = mmap(vaddr, length, PROT_READ | PROT_WRITE, + flags, -1, 0); + } +#endif + } else { +#if defined(TARGET_S390X) && defined(CONFIG_KVM) + flags |= MAP_SHARED | MAP_ANONYMOUS; + area = mmap(vaddr, length, PROT_EXEC|PROT_READ|PROT_WRITE, + flags, -1, 0); +#else + flags |= MAP_PRIVATE | MAP_ANONYMOUS; + area = mmap(vaddr, length, PROT_READ | PROT_WRITE, + flags, -1, 0); +#endif + } + if (area != vaddr) { + fprintf(stderr, "Could not remap addr: %lx@%lx\n", + length, addr); + exit(1); + } + qemu_madvise(vaddr, length, QEMU_MADV_MERGEABLE); + } + return; + } + } +} +#endif /* !_WIN32 */ + /* Return a host pointer to ram allocated with qemu_ram_alloc. With the exception of the softmmu code in this file, this should only be used for local memory (e.g. video ram) that the device owns, From 3c85e74fbf9e5a39d8d13ef91a5f3dd91f0bc8a8 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 2 Mar 2011 08:56:20 +0100 Subject: [PATCH 17/34] KVM, MCE, unpoison memory address across reboot In Linux kernel HWPoison processing implementation, the virtual address in processes mapping the error physical memory page is marked as HWPoison. So that, the further accessing to the virtual address will kill corresponding processes with SIGBUS. If the error physical memory page is used by a KVM guest, the SIGBUS will be sent to QEMU, and QEMU will simulate a MCE to report that memory error to the guest OS. If the guest OS can not recover from the error (for example, the page is accessed by kernel code), guest OS will reboot the system. 
But because the underlying host virtual address backing the guest physical memory is still poisoned, if the guest system accesses the corresponding guest physical memory even after rebooting, the SIGBUS will still be sent to QEMU and an MCE will be simulated. That is, the guest system cannot recover via rebooting. In fact, across a reboot, the contents of the guest physical memory page need not be kept. We can allocate a new host physical page to back the corresponding guest physical address. This patch fixes this issue in QEMU-KVM by calling qemu_ram_remap() to clear the corresponding page table entry, which makes it possible to allocate a new page and recover from the issue. [ Jan: rebasing and tiny cleanups ] Signed-off-by: Huang Ying Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 15741861a1..f7995bd877 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -173,7 +173,40 @@ static int get_para_features(CPUState *env) } #endif /* CONFIG_KVM_PARA */ +typedef struct HWPoisonPage { + ram_addr_t ram_addr; + QLIST_ENTRY(HWPoisonPage) list; +} HWPoisonPage; + +static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list = + QLIST_HEAD_INITIALIZER(hwpoison_page_list); + +static void kvm_unpoison_all(void *param) +{ + HWPoisonPage *page, *next_page; + + QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) { + QLIST_REMOVE(page, list); + qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE); + qemu_free(page); + } +} + #ifdef KVM_CAP_MCE +static void kvm_hwpoison_page_add(ram_addr_t ram_addr) +{ + HWPoisonPage *page; + + QLIST_FOREACH(page, &hwpoison_page_list, list) { + if (page->ram_addr == ram_addr) { + return; + } + } + page = qemu_malloc(sizeof(HWPoisonPage)); + page->ram_addr = ram_addr; + QLIST_INSERT_HEAD(&hwpoison_page_list, page, list); +} + static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap, 
int *max_banks) { @@ -233,6 +266,7 @@ int kvm_arch_on_sigbus_vcpu(CPUState *env, int code, void *addr) hardware_memory_error(); } } + kvm_hwpoison_page_add(ram_addr); kvm_mce_inject(env, paddr, code); } else #endif /* KVM_CAP_MCE */ @@ -263,6 +297,7 @@ int kvm_arch_on_sigbus(int code, void *addr) "QEMU itself instead of guest system!: %p\n", addr); return 0; } + kvm_hwpoison_page_add(ram_addr); kvm_mce_inject(first_cpu, paddr, code); } else #endif /* KVM_CAP_MCE */ @@ -571,6 +606,7 @@ int kvm_arch_init(KVMState *s) fprintf(stderr, "e820_add_entry() table is full\n"); return ret; } + qemu_register_reset(kvm_unpoison_all, NULL); return 0; } From 1ab3c6c07382c4249854f811cd6182a0a88e25fb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:12 +0100 Subject: [PATCH 18/34] Implement qemu_kvm_eat_signals only for CONFIG_LINUX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu_kvm_eat_signals requires POSIX support with realtime extensions for sigtimedwait. Not all our target platforms provide this. Moreover, undefined sigbus_reraise was referenced on non-Linux as well. 
Signed-off-by: Jan Kiszka CC: Andreas Färber Signed-off-by: Marcelo Tosatti --- cpus.c | 94 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/cpus.c b/cpus.c index 077729c507..26e5bbaeff 100644 --- a/cpus.c +++ b/cpus.c @@ -245,11 +245,58 @@ static void qemu_init_sigbus(void) prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0); } +static void qemu_kvm_eat_signals(CPUState *env) +{ + struct timespec ts = { 0, 0 }; + siginfo_t siginfo; + sigset_t waitset; + sigset_t chkset; + int r; + + sigemptyset(&waitset); + sigaddset(&waitset, SIG_IPI); + sigaddset(&waitset, SIGBUS); + + do { + r = sigtimedwait(&waitset, &siginfo, &ts); + if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { + perror("sigtimedwait"); + exit(1); + } + + switch (r) { + case SIGBUS: + if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr)) { + sigbus_reraise(); + } + break; + default: + break; + } + + r = sigpending(&chkset); + if (r == -1) { + perror("sigpending"); + exit(1); + } + } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); + +#ifndef CONFIG_IOTHREAD + if (sigismember(&chkset, SIGIO) || sigismember(&chkset, SIGALRM)) { + qemu_notify_event(); + } +#endif +} + #else /* !CONFIG_LINUX */ static void qemu_init_sigbus(void) { } + +static void qemu_kvm_eat_signals(CPUState *env) +{ +} #endif /* !CONFIG_LINUX */ #ifndef _WIN32 @@ -455,49 +502,6 @@ static void qemu_tcg_init_cpu_signals(void) #endif } -static void qemu_kvm_eat_signals(CPUState *env) -{ - struct timespec ts = { 0, 0 }; - siginfo_t siginfo; - sigset_t waitset; - sigset_t chkset; - int r; - - sigemptyset(&waitset); - sigaddset(&waitset, SIG_IPI); - sigaddset(&waitset, SIGBUS); - - do { - r = sigtimedwait(&waitset, &siginfo, &ts); - if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { - perror("sigtimedwait"); - exit(1); - } - - switch (r) { - case SIGBUS: - if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr)) { - 
sigbus_reraise(); - } - break; - default: - break; - } - - r = sigpending(&chkset); - if (r == -1) { - perror("sigpending"); - exit(1); - } - } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); - -#ifndef CONFIG_IOTHREAD - if (sigismember(&chkset, SIGIO) || sigismember(&chkset, SIGALRM)) { - qemu_notify_event(); - } -#endif -} - #else /* _WIN32 */ HANDLE qemu_event_handle; @@ -526,10 +530,6 @@ static void qemu_event_increment(void) } } -static void qemu_kvm_eat_signals(CPUState *env) -{ -} - static int qemu_signal_init(void) { return 0; From 1009d2edea4acd5b683ab1572ad7f4d4583e1860 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:13 +0100 Subject: [PATCH 19/34] x86: Unbreak TCG support for hardware breakpoints Commit 83f338f73e broke x86 hardware breakpoint emulation by moving the debug exception handling out of cpu_exec. Fix this by moving all TCG related bits back, only leaving the generic guest debugging parts in cpus.c. Signed-off-by: Jan Kiszka CC: TeLeMan Signed-off-by: Marcelo Tosatti --- cpu-exec.c | 27 +++++++++++++++++++++++++++ cpus.c | 27 +++------------------------ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 34eaedca04..5cc937904a 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -196,6 +196,30 @@ static inline TranslationBlock *tb_find_fast(void) return tb; } +static CPUDebugExcpHandler *debug_excp_handler; + +CPUDebugExcpHandler *cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler) +{ + CPUDebugExcpHandler *old_handler = debug_excp_handler; + + debug_excp_handler = handler; + return old_handler; +} + +static void cpu_handle_debug_exception(CPUState *env) +{ + CPUWatchpoint *wp; + + if (!env->watchpoint_hit) { + QTAILQ_FOREACH(wp, &env->watchpoints, entry) { + wp->flags &= ~BP_WATCHPOINT_HIT; + } + } + if (debug_excp_handler) { + debug_excp_handler(env); + } +} + /* main execution loop */ volatile sig_atomic_t exit_request; @@ -269,6 +293,9 @@ int 
cpu_exec(CPUState *env1) if (env->exception_index >= EXCP_INTERRUPT) { /* exit request from the cpu execution loop */ ret = env->exception_index; + if (ret == EXCP_DEBUG) { + cpu_handle_debug_exception(env); + } break; } else { #if defined(CONFIG_USER_ONLY) diff --git a/cpus.c b/cpus.c index 26e5bbaeff..975a6ce949 100644 --- a/cpus.c +++ b/cpus.c @@ -166,29 +166,8 @@ static bool all_cpu_threads_idle(void) return true; } -static CPUDebugExcpHandler *debug_excp_handler; - -CPUDebugExcpHandler *cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler) +static void cpu_handle_guest_debug(CPUState *env) { - CPUDebugExcpHandler *old_handler = debug_excp_handler; - - debug_excp_handler = handler; - return old_handler; -} - -static void cpu_handle_debug_exception(CPUState *env) -{ - CPUWatchpoint *wp; - - if (!env->watchpoint_hit) { - QTAILQ_FOREACH(wp, &env->watchpoints, entry) { - wp->flags &= ~BP_WATCHPOINT_HIT; - } - } - if (debug_excp_handler) { - debug_excp_handler(env); - } - gdb_set_stop_cpu(env); qemu_system_debug_request(); #ifdef CONFIG_IOTHREAD @@ -818,7 +797,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) if (cpu_can_run(env)) { r = kvm_cpu_exec(env); if (r == EXCP_DEBUG) { - cpu_handle_debug_exception(env); + cpu_handle_guest_debug(env); } } qemu_kvm_wait_io_event(env); @@ -1110,7 +1089,7 @@ bool cpu_exec_all(void) r = tcg_cpu_exec(env); } if (r == EXCP_DEBUG) { - cpu_handle_debug_exception(env); + cpu_handle_guest_debug(env); break; } } else if (env->stop || env->stopped) { From fd28aa132362320f9f3a30b23f634bb14dee528e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:14 +0100 Subject: [PATCH 20/34] s390: Detect invalid invocations of qemu_ram_free/remap This both detects invalid invocations of qemu_ram_free and qemu_ram_remap when mem_path is non-NULL and fixes a build error on s390 ("'area' may be used uninitialized in this function"). 
Signed-off-by: Jan Kiszka CC: Alexander Graf Signed-off-by: Marcelo Tosatti --- exec.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/exec.c b/exec.c index 723ace4840..c5358c3ec0 100644 --- a/exec.c +++ b/exec.c @@ -2931,6 +2931,8 @@ void qemu_ram_free(ram_addr_t addr) } else { qemu_vfree(block->host); } +#else + abort(); #endif } else { #if defined(TARGET_S390X) && defined(CONFIG_KVM) @@ -2979,6 +2981,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) area = mmap(vaddr, length, PROT_READ | PROT_WRITE, flags, -1, 0); } +#else + abort(); #endif } else { #if defined(TARGET_S390X) && defined(CONFIG_KVM) From f2c1cc81c8229e8c1e26aae517d32aa534936a0c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:18 +0100 Subject: [PATCH 21/34] kvm: Add in-kernel irqchip awareness to cpu_thread_is_idle With in-kernel irqchip support enabled, the vcpu threads sleep in kernel space while halted. Account for this difference in cpu_thread_is_idle. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- cpus.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpus.c b/cpus.c index 975a6ce949..d310b7e3bd 100644 --- a/cpus.c +++ b/cpus.c @@ -148,7 +148,8 @@ static bool cpu_thread_is_idle(CPUState *env) if (env->stopped || !vm_running) { return true; } - if (!env->halted || qemu_cpu_has_work(env)) { + if (!env->halted || qemu_cpu_has_work(env) || + (kvm_enabled() && kvm_irqchip_in_kernel())) { return false; } return true; From 4601f7b04c63072c73870d4f4db2a5c5ad5ad8d2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:19 +0100 Subject: [PATCH 22/34] kvm: x86: Do not leave halt if interrupts are disabled When an external interrupt is pending but IF is cleared, we must not leave the halt state prematurely. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index f7995bd877..3a07fceb2a 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1590,7 +1590,9 @@ int kvm_arch_process_async_events(CPUState *env) return 0; } - if (env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI)) { + if (((env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK)) || + (env->interrupt_request & CPU_INTERRUPT_NMI)) { env->halted = 0; } if (env->interrupt_request & CPU_INTERRUPT_INIT) { From d841b6c4f16c3fc7afe2ce355e7e42813345f053 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:20 +0100 Subject: [PATCH 23/34] kvm: Mark VCPU state dirty on creation This prevents early cpu_synchronize_state calls from trying to retrieve an uninitialized state from the kernel. Such a retrieval even causes a deadlock if io-thread is enabled. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kvm-all.c b/kvm-all.c index 226843c65f..62b098441d 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -211,6 +211,7 @@ int kvm_init_vcpu(CPUState *env) env->kvm_fd = ret; env->kvm_state = s; + env->kvm_vcpu_dirty = 1; mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { From ebda377f8c6d9a5ec69a0aeb2000aa5ce74f7fa2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:21 +0100 Subject: [PATCH 24/34] x86: Properly reset PAT MSR Conforming to the Intel spec, set the power-on value of PAT also on reset, but save it across INIT. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/cpu.h | 4 ++-- target-i386/cpuid.c | 1 - target-i386/helper.c | 5 +++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index d0eae754cc..c7047d5912 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -685,8 +685,6 @@ typedef struct CPUX86State { uint64_t tsc; - uint64_t pat; - uint64_t mcg_status; /* exception/interrupt handling */ @@ -707,6 +705,8 @@ typedef struct CPUX86State { CPU_COMMON + uint64_t pat; + /* processor features (e.g. for CPUID insn) */ uint32_t cpuid_level; uint32_t cpuid_vendor1; diff --git a/target-i386/cpuid.c b/target-i386/cpuid.c index 5382a283f5..814d13e767 100644 --- a/target-i386/cpuid.c +++ b/target-i386/cpuid.c @@ -847,7 +847,6 @@ int cpu_x86_register (CPUX86State *env, const char *cpu_model) env->cpuid_version |= ((def->model & 0xf) << 4) | ((def->model >> 4) << 16); env->cpuid_version |= def->stepping; env->cpuid_features = def->features; - env->pat = 0x0007040600070406ULL; env->cpuid_ext_features = def->ext_features; env->cpuid_ext2_features = def->ext2_features; env->cpuid_ext3_features = def->ext3_features; diff --git a/target-i386/helper.c b/target-i386/helper.c index a08309f97a..d15fca591e 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -99,6 +99,8 @@ void cpu_reset(CPUX86State *env) env->mxcsr = 0x1f80; + env->pat = 0x0007040600070406ULL; + memset(env->dr, 0, sizeof(env->dr)); env->dr[6] = DR6_FIXED_1; env->dr[7] = DR7_FIXED_1; @@ -1280,8 +1282,11 @@ CPUX86State *cpu_x86_init(const char *cpu_model) void do_cpu_init(CPUState *env) { int sipi = env->interrupt_request & CPU_INTERRUPT_SIPI; + uint64_t pat = env->pat; + cpu_reset(env); env->interrupt_request = sipi; + env->pat = pat; apic_init_reset(env->apic_state); env->halted = !cpu_is_bsp(env); } From c995b495b9d6e60ab1e390bd398a22425d0b3c8c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:22 +0100 Subject: 
[PATCH 25/34] x86: Save/restore PAT MSR Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/machine.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/target-i386/machine.c b/target-i386/machine.c index d78eceb779..6384f54b95 100644 --- a/target-i386/machine.c +++ b/target-i386/machine.c @@ -491,6 +491,8 @@ static const VMStateDescription vmstate_cpu = { VMSTATE_UINT64_V(xcr0, CPUState, 12), VMSTATE_UINT64_V(xstate_bv, CPUState, 12), VMSTATE_YMMH_REGS_VARS(ymmh_regs, CPUState, CPU_NB_REGS, 12), + + VMSTATE_UINT64_V(pat, CPUState, 13), VMSTATE_END_OF_LIST() /* The above list is not sorted /wrt version numbers, watch out! */ }, From 0c03266a965a8e5353bb0e8d1b2d3a3ac04aa686 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:23 +0100 Subject: [PATCH 26/34] kvm: x86: Synchronize PAT MSR with the kernel Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 3a07fceb2a..032bc3e1ac 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -861,6 +861,7 @@ static int kvm_put_msrs(CPUState *env, int level) kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs); kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp); kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip); + kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat); if (has_msr_star) { kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star); } @@ -1113,6 +1114,7 @@ static int kvm_get_msrs(CPUState *env) msrs[n++].index = MSR_IA32_SYSENTER_CS; msrs[n++].index = MSR_IA32_SYSENTER_ESP; msrs[n++].index = MSR_IA32_SYSENTER_EIP; + msrs[n++].index = MSR_PAT; if (has_msr_star) { msrs[n++].index = MSR_STAR; } @@ -1168,6 +1170,9 @@ static int kvm_get_msrs(CPUState *env) case MSR_IA32_SYSENTER_EIP: env->sysenter_eip = msrs[i].data; break; + case MSR_PAT: + env->pat = msrs[i].data; + break; case MSR_STAR: env->star = 
msrs[i].data; break; From 51e8fa606e3cbef9cdf5c84f2468dcf1e07ddb70 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:24 +0100 Subject: [PATCH 27/34] kvm: Consider EXIT_DEBUG unknown without CAP_SET_GUEST_DEBUG Without KVM_CAP_SET_GUEST_DEBUG, we neither motivate the kernel to report KVM_EXIT_DEBUG nor do we expect such exits. So fall through to the arch code which will simply report an unknown exit reason. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 62b098441d..906521e837 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -975,17 +975,17 @@ int kvm_cpu_exec(CPUState *env) ret = kvm_handle_internal_error(env, run); break; #endif +#ifdef KVM_CAP_SET_GUEST_DEBUG case KVM_EXIT_DEBUG: DPRINTF("kvm_exit_debug\n"); -#ifdef KVM_CAP_SET_GUEST_DEBUG if (kvm_arch_debug(&run->debug.arch)) { ret = EXCP_DEBUG; goto out; } /* re-enter, this exception was guest-internal */ ret = 1; -#endif /* KVM_CAP_SET_GUEST_DEBUG */ break; +#endif /* KVM_CAP_SET_GUEST_DEBUG */ default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(env, run); From 7cbb533f92f040ab1dbafe2233a51c7d4e321d90 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:25 +0100 Subject: [PATCH 28/34] kvm: Keep KVM_RUN return value in separate variable Avoid using 'ret' both for the return value of KVM_RUN as well as the code kvm_cpu_exec is supposed to return. Both have no direct relation. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 906521e837..be235ec29b 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -890,7 +890,7 @@ void kvm_cpu_synchronize_post_init(CPUState *env) int kvm_cpu_exec(CPUState *env) { struct kvm_run *run = env->kvm_run; - int ret; + int ret, run_ret; DPRINTF("kvm_cpu_exec()\n"); @@ -920,7 +920,7 @@ int kvm_cpu_exec(CPUState *env) cpu_single_env = NULL; qemu_mutex_unlock_iothread(); - ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); + run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); qemu_mutex_lock_iothread(); cpu_single_env = env; @@ -928,14 +928,14 @@ int kvm_cpu_exec(CPUState *env) kvm_flush_coalesced_mmio_buffer(); - if (ret == -EINTR || ret == -EAGAIN) { + if (run_ret == -EINTR || run_ret == -EAGAIN) { DPRINTF("io window exit\n"); ret = 0; break; } - if (ret < 0) { - DPRINTF("kvm run failed %s\n", strerror(-ret)); + if (run_ret < 0) { + DPRINTF("kvm run failed %s\n", strerror(-run_ret)); abort(); } From dc77d341d2dcd3eeadae736ed7e7775e22324844 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:26 +0100 Subject: [PATCH 29/34] kvm: Reorder error handling of KVM_RUN Test for general errors first as this is the slower path. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index be235ec29b..85315551b4 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -928,13 +928,12 @@ int kvm_cpu_exec(CPUState *env) kvm_flush_coalesced_mmio_buffer(); - if (run_ret == -EINTR || run_ret == -EAGAIN) { - DPRINTF("io window exit\n"); - ret = 0; - break; - } - if (run_ret < 0) { + if (run_ret == -EINTR || run_ret == -EAGAIN) { + DPRINTF("io window exit\n"); + ret = 0; + break; + } DPRINTF("kvm run failed %s\n", strerror(-run_ret)); abort(); } From d73cd8f4ea1c2944bd16f7a1c445eaa25c9e6e26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:27 +0100 Subject: [PATCH 30/34] kvm: Rework inner loop of kvm_cpu_exec Let kvm_cpu_exec return EXCP_* values consistently and generate those codes already inside its inner loop. This means we will now re-enter the kernel while ret == 0. Update kvm_handle_internal_error accordingly, but keep kvm_arch_handle_exit untouched, it will be converted in a separate step. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 85315551b4..271e361f47 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -831,7 +831,7 @@ static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run) fprintf(stderr, "emulation failure\n"); if (!kvm_arch_stop_on_emulation_error(env)) { cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); - return 0; + return EXCP_INTERRUPT; } } /* FIXME: Should trigger a qmp message to let management know @@ -931,14 +931,13 @@ int kvm_cpu_exec(CPUState *env) if (run_ret < 0) { if (run_ret == -EINTR || run_ret == -EAGAIN) { DPRINTF("io window exit\n"); - ret = 0; + ret = EXCP_INTERRUPT; break; } DPRINTF("kvm run failed %s\n", strerror(-run_ret)); abort(); } - ret = 0; /* exit loop */ switch (run->exit_reason) { case KVM_EXIT_IO: DPRINTF("handle_io\n"); @@ -947,7 +946,7 @@ int kvm_cpu_exec(CPUState *env) run->io.direction, run->io.size, run->io.count); - ret = 1; + ret = 0; break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); @@ -955,14 +954,16 @@ int kvm_cpu_exec(CPUState *env) run->mmio.data, run->mmio.len, run->mmio.is_write); - ret = 1; + ret = 0; break; case KVM_EXIT_IRQ_WINDOW_OPEN: DPRINTF("irq_window_open\n"); + ret = EXCP_INTERRUPT; break; case KVM_EXIT_SHUTDOWN: DPRINTF("shutdown\n"); qemu_system_reset_request(); + ret = EXCP_INTERRUPT; break; case KVM_EXIT_UNKNOWN: fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", @@ -979,28 +980,29 @@ int kvm_cpu_exec(CPUState *env) DPRINTF("kvm_exit_debug\n"); if (kvm_arch_debug(&run->debug.arch)) { ret = EXCP_DEBUG; - goto out; + break; } /* re-enter, this exception was guest-internal */ - ret = 1; + ret = 0; break; #endif /* KVM_CAP_SET_GUEST_DEBUG */ default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(env, run); + if (ret == 0) { + ret = EXCP_INTERRUPT; + } else if (ret > 0) { + ret = 0; + } break; } - 
} while (ret > 0); + } while (ret == 0); if (ret < 0) { cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); vm_stop(VMSTOP_PANIC); } - ret = EXCP_INTERRUPT; -#ifdef KVM_CAP_SET_GUEST_DEBUG -out: -#endif env->exit_request = 0; cpu_single_env = NULL; return ret; From bb4ea39329d6c3de4c10034621781f703d095699 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:28 +0100 Subject: [PATCH 31/34] kvm: Align kvm_arch_handle_exit to kvm_cpu_exec changes Make the return code of kvm_arch_handle_exit directly usable for kvm_cpu_exec. This is straightforward for x86 and ppc, just s390 would require more work. Avoid this for now by pushing the return code translation logic into s390's kvm_arch_handle_exit. Signed-off-by: Jan Kiszka CC: Alexander Graf Signed-off-by: Marcelo Tosatti --- kvm-all.c | 5 ----- target-i386/kvm.c | 8 ++++---- target-ppc/kvm.c | 8 ++++---- target-s390x/kvm.c | 5 +++++ 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 271e361f47..f34cb69655 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -989,11 +989,6 @@ int kvm_cpu_exec(CPUState *env) default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(env, run); - if (ret == 0) { - ret = EXCP_INTERRUPT; - } else if (ret > 0) { - ret = 0; - } break; } } while (ret == 0); diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 032bc3e1ac..6f8461079a 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1618,10 +1618,10 @@ static int kvm_handle_halt(CPUState *env) (env->eflags & IF_MASK)) && !(env->interrupt_request & CPU_INTERRUPT_NMI)) { env->halted = 1; - return 0; + return EXCP_HLT; } - return 1; + return 0; } static bool host_supports_vmx(void) @@ -1637,7 +1637,7 @@ static bool host_supports_vmx(void) int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) { uint64_t code; - int ret = 0; + int ret; switch (run->exit_reason) { case KVM_EXIT_HLT: @@ -1645,7 +1645,7 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) ret = 
kvm_handle_halt(env); break; case KVM_EXIT_SET_TPR: - ret = 1; + ret = 0; break; case KVM_EXIT_FAIL_ENTRY: code = run->fail_entry.hardware_entry_failure_reason; diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 6c99a163b8..593eb98ffc 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -271,7 +271,7 @@ static int kvmppc_handle_halt(CPUState *env) env->exception_index = EXCP_HLT; } - return 1; + return 0; } /* map dcr access to existing qemu dcr emulation */ @@ -280,7 +280,7 @@ static int kvmppc_handle_dcr_read(CPUState *env, uint32_t dcrn, uint32_t *data) if (ppc_dcr_read(env->dcr_env, dcrn, data) < 0) fprintf(stderr, "Read to unhandled DCR (0x%x)\n", dcrn); - return 1; + return 0; } static int kvmppc_handle_dcr_write(CPUState *env, uint32_t dcrn, uint32_t data) @@ -288,12 +288,12 @@ static int kvmppc_handle_dcr_write(CPUState *env, uint32_t dcrn, uint32_t data) if (ppc_dcr_write(env->dcr_env, dcrn, data) < 0) fprintf(stderr, "Write to unhandled DCR (0x%x)\n", dcrn); - return 1; + return 0; } int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) { - int ret = 0; + int ret; switch (run->exit_reason) { case KVM_EXIT_DCR: diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index a85ae0fc11..91232038ea 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -497,6 +497,11 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) break; } + if (ret == 0) { + ret = EXCP_INTERRUPT; + } else if (ret > 0) { + ret = 0; + } return ret; } From 2a4dac835008da7328e61d8596b310c05cab801d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:29 +0100 Subject: [PATCH 32/34] kvm: x86: Reorder functions in kvm.c Required for next patch which will access guest debug services from kvm_arch_handle_exit. No functional changes. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- target-i386/kvm.c | 108 +++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 6f8461079a..3920444944 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1624,60 +1624,6 @@ static int kvm_handle_halt(CPUState *env) return 0; } -static bool host_supports_vmx(void) -{ - uint32_t ecx, unused; - - host_cpuid(1, 0, &unused, &unused, &ecx, &unused); - return ecx & CPUID_EXT_VMX; -} - -#define VMX_INVALID_GUEST_STATE 0x80000021 - -int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) -{ - uint64_t code; - int ret; - - switch (run->exit_reason) { - case KVM_EXIT_HLT: - DPRINTF("handle_hlt\n"); - ret = kvm_handle_halt(env); - break; - case KVM_EXIT_SET_TPR: - ret = 0; - break; - case KVM_EXIT_FAIL_ENTRY: - code = run->fail_entry.hardware_entry_failure_reason; - fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n", - code); - if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) { - fprintf(stderr, - "\nIf you're runnning a guest on an Intel machine without " - "unrestricted mode\n" - "support, the failure can be most likely due to the guest " - "entering an invalid\n" - "state for Intel VT. For example, the guest maybe running " - "in big real mode\n" - "which is not supported on less recent Intel processors." 
- "\n\n"); - } - ret = -1; - break; - case KVM_EXIT_EXCEPTION: - fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n", - run->ex.exception, run->ex.error_code); - ret = -1; - break; - default: - fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); - ret = -1; - break; - } - - return ret; -} - #ifdef KVM_CAP_SET_GUEST_DEBUG int kvm_arch_insert_sw_breakpoint(CPUState *env, struct kvm_sw_breakpoint *bp) { @@ -1860,6 +1806,60 @@ void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg) } #endif /* KVM_CAP_SET_GUEST_DEBUG */ +static bool host_supports_vmx(void) +{ + uint32_t ecx, unused; + + host_cpuid(1, 0, &unused, &unused, &ecx, &unused); + return ecx & CPUID_EXT_VMX; +} + +#define VMX_INVALID_GUEST_STATE 0x80000021 + +int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) +{ + uint64_t code; + int ret; + + switch (run->exit_reason) { + case KVM_EXIT_HLT: + DPRINTF("handle_hlt\n"); + ret = kvm_handle_halt(env); + break; + case KVM_EXIT_SET_TPR: + ret = 0; + break; + case KVM_EXIT_FAIL_ENTRY: + code = run->fail_entry.hardware_entry_failure_reason; + fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n", + code); + if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) { + fprintf(stderr, + "\nIf you're runnning a guest on an Intel machine without " + "unrestricted mode\n" + "support, the failure can be most likely due to the guest " + "entering an invalid\n" + "state for Intel VT. For example, the guest maybe running " + "in big real mode\n" + "which is not supported on less recent Intel processors." 
+ "\n\n"); + } + ret = -1; + break; + case KVM_EXIT_EXCEPTION: + fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n", + run->ex.exception, run->ex.error_code); + ret = -1; + break; + default: + fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); + ret = -1; + break; + } + + return ret; +} + bool kvm_arch_stop_on_emulation_error(CPUState *env) { return !(env->cr[0] & CR0_PE_MASK) || From f2574737f6a1218b4f4809ad6c8aba935126d90f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:30 +0100 Subject: [PATCH 33/34] kvm: x86: Push kvm_arch_debug to kvm_arch_handle_exit There are no generic bits remaining in the handling of KVM_EXIT_DEBUG. So push its logic completely into arch hands, i.e. only x86 so far. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- kvm-all.c | 11 ----------- kvm.h | 2 -- target-i386/kvm.c | 25 ++++++++++++++++--------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index f34cb69655..1d7e8eabf4 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -975,17 +975,6 @@ int kvm_cpu_exec(CPUState *env) ret = kvm_handle_internal_error(env, run); break; #endif -#ifdef KVM_CAP_SET_GUEST_DEBUG - case KVM_EXIT_DEBUG: - DPRINTF("kvm_exit_debug\n"); - if (kvm_arch_debug(&run->debug.arch)) { - ret = EXCP_DEBUG; - break; - } - /* re-enter, this exception was guest-internal */ - ret = 0; - break; -#endif /* KVM_CAP_SET_GUEST_DEBUG */ default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(env, run); diff --git a/kvm.h b/kvm.h index 7bc04e0f14..d565dba1da 100644 --- a/kvm.h +++ b/kvm.h @@ -136,8 +136,6 @@ struct kvm_sw_breakpoint { QTAILQ_HEAD(kvm_sw_breakpoint_head, kvm_sw_breakpoint); -int kvm_arch_debug(struct kvm_debug_exit_arch *arch_info); - struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env, target_ulong pc); diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 3920444944..a13599db81 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1731,31 
+1731,31 @@ void kvm_arch_remove_all_hw_breakpoints(void) static CPUWatchpoint hw_watchpoint; -int kvm_arch_debug(struct kvm_debug_exit_arch *arch_info) +static int kvm_handle_debug(struct kvm_debug_exit_arch *arch_info) { - int handle = 0; + int ret = 0; int n; if (arch_info->exception == 1) { if (arch_info->dr6 & (1 << 14)) { if (cpu_single_env->singlestep_enabled) { - handle = 1; + ret = EXCP_DEBUG; } } else { for (n = 0; n < 4; n++) { if (arch_info->dr6 & (1 << n)) { switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) { case 0x0: - handle = 1; + ret = EXCP_DEBUG; break; case 0x1: - handle = 1; + ret = EXCP_DEBUG; cpu_single_env->watchpoint_hit = &hw_watchpoint; hw_watchpoint.vaddr = hw_breakpoint[n].addr; hw_watchpoint.flags = BP_MEM_WRITE; break; case 0x3: - handle = 1; + ret = EXCP_DEBUG; cpu_single_env->watchpoint_hit = &hw_watchpoint; hw_watchpoint.vaddr = hw_breakpoint[n].addr; hw_watchpoint.flags = BP_MEM_ACCESS; @@ -1765,17 +1765,18 @@ int kvm_arch_debug(struct kvm_debug_exit_arch *arch_info) } } } else if (kvm_find_sw_breakpoint(cpu_single_env, arch_info->pc)) { - handle = 1; + ret = EXCP_DEBUG; } - if (!handle) { + if (ret == 0) { cpu_synchronize_state(cpu_single_env); assert(cpu_single_env->exception_injected == -1); + /* pass to guest */ cpu_single_env->exception_injected = arch_info->exception; cpu_single_env->has_error_code = 0; } - return handle; + return ret; } void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg) @@ -1851,6 +1852,12 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) run->ex.exception, run->ex.error_code); ret = -1; break; +#ifdef KVM_CAP_SET_GUEST_DEBUG + case KVM_EXIT_DEBUG: + DPRINTF("kvm_exit_debug\n"); + ret = kvm_handle_debug(&run->debug.arch); + break; +#endif /* KVM_CAP_SET_GUEST_DEBUG */ default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; From dc7a09cfe47679d89289101cc9eb387c45e48fe7 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 15 Mar 2011 12:26:31 
+0100 Subject: [PATCH 34/34] Expose thread_id in info cpus Based on patch by Glauber Costa: To allow management applications like libvirt to apply CPU affinities to the VCPU threads, expose their ID via info cpus. This patch provides the pre-existing and used interface from qemu-kvm. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- cpu-defs.h | 1 + cpus.c | 2 ++ exec.c | 3 +++ monitor.c | 4 ++++ os-posix.c | 10 ++++++++++ os-win32.c | 5 +++++ osdep.h | 1 + qmp-commands.hx | 3 +++ 8 files changed, 29 insertions(+) diff --git a/cpu-defs.h b/cpu-defs.h index 2b59fa6fb0..db48a7afef 100644 --- a/cpu-defs.h +++ b/cpu-defs.h @@ -203,6 +203,7 @@ typedef struct CPUWatchpoint { int nr_cores; /* number of cores within this CPU package */ \ int nr_threads;/* number of threads within this CPU */ \ int running; /* Nonzero if cpu is currently running(usermode). */ \ + int thread_id; \ /* user data */ \ void *opaque; \ \ diff --git a/cpus.c b/cpus.c index d310b7e3bd..28c2da2aec 100644 --- a/cpus.c +++ b/cpus.c @@ -776,6 +776,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) qemu_mutex_lock(&qemu_global_mutex); qemu_thread_get_self(env->thread); + env->thread_id = qemu_get_thread_id(); r = kvm_init_vcpu(env); if (r < 0) { @@ -817,6 +818,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) /* signal CPU creation */ qemu_mutex_lock(&qemu_global_mutex); for (env = first_cpu; env != NULL; env = env->next_cpu) { + env->thread_id = qemu_get_thread_id(); env->created = 1; } qemu_cond_signal(&qemu_cpu_cond); diff --git a/exec.c b/exec.c index c5358c3ec0..964ce318fb 100644 --- a/exec.c +++ b/exec.c @@ -638,6 +638,9 @@ void cpu_exec_init(CPUState *env) env->numa_node = 0; QTAILQ_INIT(&env->breakpoints); QTAILQ_INIT(&env->watchpoints); +#ifndef CONFIG_USER_ONLY + env->thread_id = qemu_get_thread_id(); +#endif *penv = env; #if defined(CONFIG_USER_ONLY) cpu_list_unlock(); diff --git a/monitor.c b/monitor.c index ae20927c5a..481572dec6 100644 --- a/monitor.c +++ b/monitor.c @@ 
-897,6 +897,9 @@ static void print_cpu_iter(QObject *obj, void *opaque) monitor_printf(mon, " (halted)"); } + monitor_printf(mon, " thread_id=%" PRId64 " ", + qdict_get_int(cpu, "thread_id")); + monitor_printf(mon, "\n"); } @@ -941,6 +944,7 @@ static void do_info_cpus(Monitor *mon, QObject **ret_data) #elif defined(TARGET_MIPS) qdict_put(cpu, "PC", qint_from_int(env->active_tc.PC)); #endif + qdict_put(cpu, "thread_id", qint_from_int(env->thread_id)); qlist_append(cpu_list, cpu); } diff --git a/os-posix.c b/os-posix.c index 38c29d1afe..7971f8620b 100644 --- a/os-posix.c +++ b/os-posix.c @@ -41,6 +41,7 @@ #ifdef CONFIG_LINUX #include +#include #endif #ifdef CONFIG_EVENTFD @@ -382,3 +383,12 @@ int qemu_create_pidfile(const char *filename) return 0; } + +int qemu_get_thread_id(void) +{ +#if defined (__linux__) + return syscall(SYS_gettid); +#else + return getpid(); +#endif +} diff --git a/os-win32.c b/os-win32.c index c971d928ab..d6d54c60b9 100644 --- a/os-win32.c +++ b/os-win32.c @@ -266,3 +266,8 @@ int qemu_create_pidfile(const char *filename) } return 0; } + +int qemu_get_thread_id(void) +{ + return GetCurrentThreadId(); +} diff --git a/osdep.h b/osdep.h index 27eedcf100..748df5416b 100644 --- a/osdep.h +++ b/osdep.h @@ -130,5 +130,6 @@ void qemu_vfree(void *ptr); int qemu_madvise(void *addr, size_t len, int advice); int qemu_create_pidfile(const char *filename); +int qemu_get_thread_id(void); #endif diff --git a/qmp-commands.hx b/qmp-commands.hx index df40a3d42e..1f72a8d422 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1194,6 +1194,7 @@ Return a json-array. Each CPU is represented by a json-object, which contains: "nip": PPC (json-int) "pc" and "npc": sparc (json-int) "PC": mips (json-int) +- "thread_id": ID of the underlying host thread (json-int) Example: @@ -1205,12 +1206,14 @@ Example: "current":true, "halted":false, "pc":3227107138 + "thread_id":3134 }, { "CPU":1, "current":false, "halted":true, "pc":7108165 + "thread_id":3135 } ] }