From a06cdb5676272a12056820aeb49a1416ad2d0c6f Mon Sep 17 00:00:00 2001
From: Jean Delvare
Date: Tue, 18 May 2010 09:34:12 +0200
Subject: [PATCH 1/8] KVM: powerpc: fix init/exit annotation

kvmppc_e500_exit() is a module_exit function, so it should be tagged
with __exit, not __init. The incorrect annotation was added by commit
2986b8c72c272ea58edd37903b042c6da985627d.

Signed-off-by: Jean Delvare
Cc: stable@kernel.org
Signed-off-by: Alexander Graf
Signed-off-by: Avi Kivity
---
 arch/powerpc/kvm/e500.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index bc2b4004eb26..e8a00b0c4449 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -164,7 +164,7 @@ static int __init kvmppc_e500_init(void)
 	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
 }
 
-static void __init kvmppc_e500_exit(void)
+static void __exit kvmppc_e500_exit(void)
 {
 	kvmppc_booke_exit();
 }

From fe5913e4e1700cbfc337f4b1da9ddb26f6a55586 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 17 May 2010 14:43:34 +0200
Subject: [PATCH 2/8] KVM: SVM: Handle MCEs early in the vmexit process

This patch moves handling of the MC vmexits to an earlier point in the
vmexit path. The handle_exit function is too late because the vcpu might
already have changed its physical cpu.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/svm.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 96dc232bfc56..5e1ed033cd7e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1410,7 +1410,7 @@ static int nm_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static void svm_handle_mce(struct vcpu_svm *svm)
 {
 	/*
 	 * On an #MC intercept the MCE handler is not called automatically in
@@ -1420,6 +1420,11 @@
 		"int $0x12\n");
 	/* not sure if we ever come back to this point */
 
+	return;
+}
+
+static int mc_interception(struct vcpu_svm *svm)
+{
 	return 1;
 }
 
@@ -3088,6 +3093,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
 		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
 	}
+
+	/*
+	 * We need to handle MC intercepts here before the vcpu has a chance to
+	 * change the physical cpu
+	 */
+	if (unlikely(svm->vmcb->control.exit_code ==
+		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
+		svm_handle_mce(svm);
 }
 
 #undef R
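A note on the section annotations behind patch 1: __init code is placed
in a section that the kernel frees once module loading finishes, so an
exit handler tagged __init can be called after its code is gone. A
minimal, hypothetical module skeleton (demo_init/demo_exit are made-up
names, not from the series) showing the intended pairing:

	#include <linux/init.h>
	#include <linux/module.h>

	/* __init: this code may be freed once the module has loaded */
	static int __init demo_init(void)
	{
		return 0;
	}

	/* __exit: runs at unload time, so it must never be marked
	 * __init, since that memory was freed long before */
	static void __exit demo_exit(void)
	{
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
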
From 67ec66077799f2fef84b21a643912b179c422281 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 17 May 2010 14:43:35 +0200
Subject: [PATCH 3/8] KVM: SVM: Implement workaround for Erratum 383

This patch implements a workaround for AMD erratum 383 in KVM. Without
this fix it is possible for a guest to kill the host machine. The
workaround implemented here is the one suggested for hypervisors, which
will be published in the next revision guide update.

[jan: fix overflow warning on i386]
[xiao: fix unused variable warning]

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel
Signed-off-by: Jan Kiszka
Signed-off-by: Xiao Guangrong
Signed-off-by: Avi Kivity
---
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kvm/svm.c               | 81 ++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b49d8ca228f6..8c7ae4318629 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -110,6 +110,7 @@
 #define MSR_AMD64_PATCH_LOADER 0xc0010020
 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
 #define MSR_AMD64_OSVW_STATUS 0xc0010141
+#define MSR_AMD64_DC_CFG 0xc0011022
 #define MSR_AMD64_IBSFETCHCTL 0xc0011030
 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031
 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5e1ed033cd7e..ce438e0fdd26 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 
+#include <asm/tlbflush.h>
 #include <asm/desc.h>
 
 #include <asm/virtext.h>
@@ -56,6 +57,8 @@ MODULE_LICENSE("GPL");
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
+static bool erratum_383_found __read_mostly;
+
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
 	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -374,6 +377,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	svm->vmcb->control.event_inj_err = error_code;
 }
 
+static void svm_init_erratum_383(void)
+{
+	u32 low, high;
+	int err;
+	u64 val;
+
+	/* Only Fam10h is affected */
+	if (boot_cpu_data.x86 != 0x10)
+		return;
+
+	/* Use _safe variants to not break nested virtualization */
+	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+	if (err)
+		return;
+
+	val |= (1ULL << 47);
+
+	low = lower_32_bits(val);
+	high = upper_32_bits(val);
+
+	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+	erratum_383_found = true;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
@@ -429,6 +457,8 @@ static int svm_hardware_enable(void *garbage)
 
 	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 
+	svm_init_erratum_383();
+
 	return 0;
 }
 
@@ -1410,8 +1440,59 @@ static int nm_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static bool is_erratum_383(void)
+{
+	int err, i;
+	u64 value;
+
+	if (!erratum_383_found)
+		return false;
+
+	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+	if (err)
+		return false;
+
+	/* Bit 62 may or may not be set for this mce */
+	value &= ~(1ULL << 62);
+
+	if (value != 0xb600000000010015ULL)
+		return false;
+
+	/* Clear MCi_STATUS registers */
+	for (i = 0; i < 6; ++i)
+		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+	if (!err) {
+		u32 low, high;
+
+		value &= ~(1ULL << 2);
+		low = lower_32_bits(value);
+		high = upper_32_bits(value);
+
+		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+	}
+
+	/* Flush tlb to evict multi-match entries */
+	__flush_tlb_all();
+
+	return true;
+}
+
 static void svm_handle_mce(struct vcpu_svm *svm)
 {
+	if (is_erratum_383()) {
+		/*
+		 * Erratum 383 triggered. Guest state is corrupt so kill the
+		 * guest.
+		 */
+		pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+
+		return;
+	}
+
 	/*
 	 * On an #MC intercept the MCE handler is not called automatically in
 	 * the host. So do it by hand here.
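Background on the signature check in is_erratum_383() above, with a
stand-alone sketch (the helper name is made up; only the constants come
from the patch): bit 62 of MCi_STATUS is the overflow flag and may or
may not be latched for this machine check, so it is masked out before
comparing against the known erratum 383 signature.

	#include <stdbool.h>
	#include <stdint.h>

	/* Hypothetical stand-alone version of the compare in
	 * is_erratum_383(). Clearing bit 62 (the MCi_STATUS overflow
	 * flag) lets the signature match whether or not a second error
	 * was logged on top of the first. */
	static bool mc0_status_matches_erratum_383(uint64_t mc0_status)
	{
		mc0_status &= ~(1ULL << 62);
		return mc0_status == 0xb600000000010015ULL;
	}
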
From 3be2264be3c00865116f997dc53ebcc90fe7fc4b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Fri, 28 May 2010 09:44:59 -0300
Subject: [PATCH 4/8] KVM: MMU: invalidate and flush on spte small->large page
 size change

Always invalidate spte and flush TLBs when changing page size, to make
sure different sized translations for the same address are never cached
in a CPU's TLB. Currently the only case where this occurs is when a
non-leaf spte pointer is overwritten by a leaf, large spte entry. This
can happen after dirty logging is disabled on a memslot, for example.

Noticed by Andrea.

KVM-Stable-Tag
Signed-off-by: Marcelo Tosatti
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 81563e76e28f..6fbcb48d5a9b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1870,6 +1870,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, sptep);
+			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);

From 69325a122580d3a7b26589e8efdd6663001c3297 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 27 May 2010 14:35:58 +0300
Subject: [PATCH 5/8] KVM: MMU: Remove user access when allowing kernel access
 to gpte.w=0 page

If cr0.wp=0, we have to allow the guest kernel access to a page with
pte.w=0. We do that by setting spte.w=1, since the host cr0.wp must
remain set so the host can write protect pages. Once we allow write
access, we must remove user access, otherwise we mistakenly allow the
user to write the page.

Reviewed-by: Xiao Guangrong
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6fbcb48d5a9b..a6f695d76928 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1815,6 +1815,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK;
 
+		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+			spte &= ~PT_USER_MASK;
+
 		/*
 		 * Optimization: for pte sync, if spte was writable the hash
 		 * lookup is unnecessary (and expensive). Write protection

From 05b782ab951a896d7da41775999821f692dc9e01 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 26 May 2010 21:36:33 +0200
Subject: [PATCH 6/8] KVM: Fix order passed to iommu_unmap

This is obviously a left-over from the old interface taking the size.
Apparently a mostly harmless issue with the current iommu_unmap
implementation.

Signed-off-by: Jan Kiszka
Acked-by: Joerg Roedel
Signed-off-by: Avi Kivity
---
 virt/kvm/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index d2f06be63354..96048ee9e39e 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -271,7 +271,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 		pfn = phys >> PAGE_SHIFT;
 
 		/* Unmap address from IO address space */
-		order = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
+		order = iommu_unmap(domain, gfn_to_gpa(gfn), 0);
 		unmap_pages = 1ULL << order;
 
 		/* Unpin all pages we just unmapped to not leak any memory */
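A quick illustration of why patch 6 passes 0: iommu_unmap() now takes a
page order (log2 of the number of pages), not a byte size, so order 0
means exactly one page, while passing PAGE_SIZE (4096) would request
order 4096. A stand-alone sketch of the relationship, assuming the
usual 4 KiB page size:

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	/* An order n covers 2^n contiguous pages, so order 0 is a
	 * single 4 KiB page. */
	int main(void)
	{
		for (unsigned int order = 0; order <= 2; order++)
			printf("order %u -> %lu pages, %lu bytes\n",
			       order, 1UL << order, PAGE_SIZE << order);
		return 0;
	}
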
The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

//
@@
expression E1;
@@

* spin_lock(E1,...);
  <+... when != E1
  if (...) {
    ... when != E1
*   return ...;
  }
  ...+>
* spin_unlock(E1,...);
//

Signed-off-by: Julia Lawall
Signed-off-by: Avi Kivity
---
 arch/ia64/kvm/kvm-ia64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d5f4e9161201..21b701374f72 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -144,6 +144,7 @@ int kvm_arch_hardware_enable(void *garbage)
 			VP_INIT_ENV : VP_INIT_ENV_INITALIZE,
 			__pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base);
 	if (status != 0) {
+		spin_unlock(&vp_lock);
 		printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n");
 		return -EINVAL;
 	}

From 07dc7263b99e4ddad2b4c69765a428ccb7d48938 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Wed, 2 Jun 2010 11:26:26 -0300
Subject: [PATCH 8/8] KVM: read apic->irr with ioapic lock held

Read ioapic->irr inside ioapic->lock protected section.

KVM-Stable-Tag
Signed-off-by: Marcelo Tosatti
---
 virt/kvm/ioapic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 7c79c1d76d0c..3500dee9cf2b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -192,12 +192,13 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 {
-	u32 old_irr = ioapic->irr;
+	u32 old_irr;
 	u32 mask = 1 << irq;
 	union kvm_ioapic_redirect_entry entry;
 	int ret = 1;
 
 	spin_lock(&ioapic->lock);
+	old_irr = ioapic->irr;
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
 		entry = ioapic->redirtbl[irq];
 		level ^= entry.fields.polarity;
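A closing illustration of the pattern patch 8 enforces: a field guarded
by a lock must also be read under that lock, or a concurrent writer can
slip in between the snapshot and the critical section. A hypothetical
user-space analogue (the struct and function names are invented for
illustration and do not appear in the series):

	#include <pthread.h>
	#include <stdint.h>

	struct ioapic_like {
		pthread_mutex_t lock;
		uint32_t irr;
	};

	/* Reading irr before taking the lock, as the old code did,
	 * races with concurrent writers; moving the read inside the
	 * critical section yields a snapshot consistent with the rest
	 * of the work done under the lock. */
	static uint32_t set_irq_like(struct ioapic_like *io, uint32_t mask)
	{
		uint32_t old_irr;

		pthread_mutex_lock(&io->lock);
		old_irr = io->irr;	/* read under the lock */
		io->irr |= mask;
		pthread_mutex_unlock(&io->lock);

		return old_irr;
	}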