From 439793d4b3c99e550daebd868bbd58967c93d0b3 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Wed, 1 Aug 2012 17:01:42 +0300
Subject: [PATCH 1/6] KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value

When MSR_KVM_PV_EOI_EN was added to the msrs_to_save array,
KVM_SAVE_MSRS_BEGIN was not updated accordingly.

Signed-off-by: Gleb Natapov
Signed-off-by: Marcelo Tosatti
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42bce48f6928..dce75b760312 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN 9
+#define KVM_SAVE_MSRS_BEGIN 10
 static u32 msrs_to_save[] = {
         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW,
         MSR_KVM_WALL_CLOCK_NEW,

From 04f995a544d1289ffb8108849cd71b1325c5af6a Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Mon, 6 Aug 2012 00:03:28 +0000
Subject: [PATCH 2/6] KVM: PPC: Book3S HV: Fix incorrect branch in H_CEDE code

In handling the H_CEDE hypercall, if this vcpu has already been prodded
(with the H_PROD hypercall, which Linux guests don't in fact use), we
branch to a numeric label '1f'.  Unfortunately there is another '1:'
label before the one that we want to jump to.  This fixes the problem
by using a textual label, 'kvm_cede_prodded'.  It also changes the label
for another longish branch from '2:' to 'kvm_cede_exit' to avoid a
possible future problem if code modifications add another numeric '2:'
label in between.

Signed-off-by: Paul Mackerras
Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 5a84c8d3d040..44b72feaff7d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1421,13 +1421,13 @@ _GLOBAL(kvmppc_h_cede)
         sync                    /* order setting ceded vs. testing prodded */
         lbz     r5,VCPU_PRODDED(r3)
         cmpwi   r5,0
-        bne     1f
+        bne     kvm_cede_prodded
         li      r0,0            /* set trap to 0 to say hcall is handled */
         stw     r0,VCPU_TRAP(r3)
         li      r0,H_SUCCESS
         std     r0,VCPU_GPR(R3)(r3)
 BEGIN_FTR_SECTION
-        b       2f              /* just send it up to host on 970 */
+        b       kvm_cede_exit   /* just send it up to host on 970 */
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 
 /*
@@ -1446,7 +1446,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
         or      r4,r4,r0
         PPC_POPCNTW(R7,R4)
         cmpw    r7,r8
-        bge     2f
+        bge     kvm_cede_exit
         stwcx.  r4,0,r6
         bne     31b
         li      r0,1
@@ -1555,7 +1555,8 @@ kvm_end_cede:
         b       hcall_real_fallback
 
 /* cede when already previously prodded case */
-1:      li      r0,0
+kvm_cede_prodded:
+        li      r0,0
         stb     r0,VCPU_PRODDED(r3)
         sync                    /* order testing prodded vs. clearing ceded */
         stb     r0,VCPU_CEDED(r3)
@@ -1563,7 +1564,8 @@ kvm_end_cede:
         blr
 
 /* we've ceded but we want to give control to the host */
-2:      li      r3,H_TOO_HARD
+kvm_cede_exit:
+        li      r3,H_TOO_HARD
         blr
 
 secondary_too_late:

From 249ba1ee0f8fcb4e40caa5fbea11dafde201cc46 Mon Sep 17 00:00:00 2001
From: Alexander Graf
Date: Fri, 3 Aug 2012 13:56:33 +0200
Subject: [PATCH 3/6] KVM: PPC: Add cache flush on page map

When we map a page that wasn't icache cleared before, do so when first
mapping it in KVM, using the same information bits as the Linux mapping
logic. That way we are 100% sure that any page we map does not have
stale entries in the icache.

Signed-off-by: Alexander Graf
---
 arch/powerpc/include/asm/kvm_host.h   |  1 +
 arch/powerpc/include/asm/kvm_ppc.h    | 12 ++++++++++++
 arch/powerpc/kvm/book3s_32_mmu_host.c |  3 +++
 arch/powerpc/kvm/book3s_64_mmu_host.c |  2 ++
 arch/powerpc/kvm/e500_tlb.c           |  3 +++
 arch/powerpc/mm/mem.c                 |  1 +
 6 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12fd7bf5..a8bf5c673a3c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 
 #define KVM_MAX_VCPUS   NR_CPUS
 #define KVM_MAX_VCORES  NR_CPUS

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937a23b9..e006f0bdea95 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -219,4 +219,16 @@ void kvmppc_claim_lpid(long lpid);
 void kvmppc_free_lpid(long lpid);
 void kvmppc_init_lpid(unsigned long nr_lpids);
 
+static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
+{
+        /* Clear i-cache for new pages */
+        struct page *page;
+        page = pfn_to_page(pfn);
+        if (!test_bit(PG_arch_1, &page->flags)) {
+                flush_dcache_icache_page(page);
+                set_bit(PG_arch_1, &page->flags);
+        }
+}
+
+
 #endif /* __POWERPC_KVM_PPC_H__ */

diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index f922c29bb234..837f13e7b6bf 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -211,6 +211,9 @@ next_pteg:
                 pteg1 |= PP_RWRX;
         }
 
+        if (orig_pte->may_execute)
+                kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
+
         local_irq_disable();
 
         if (pteg[rr]) {

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 10fc8ec9d2a8..0688b6b39585 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -126,6 +126,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 
         if (!orig_pte->may_execute)
                 rflags |= HPTE_R_N;
+        else
+                kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
 
         hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M);
 

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index c510fc961302..fb3bb3ad8b3c 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -539,6 +539,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
         kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
                                 ref, gvaddr, stlbe);
+
+        /* Clear i-cache for new pages */
+        kvmppc_mmu_flush_icache(pfn);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index baaafde7d135..fbdad0e3929a 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -469,6 +469,7 @@ void flush_dcache_icache_page(struct page *page)
         __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 #endif
 }
+EXPORT_SYMBOL(flush_dcache_icache_page);
 
 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
 {
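
The pattern this patch adds (flush the caches for a page only the first
time it is mapped executable, and remember that fact in a per-page flag,
PG_arch_1) can be illustrated outside the kernel. The sketch below is
only an analogy: struct guest_page, its fields and map_executable() are
hypothetical names invented for the example, and the one real API used
is the GCC/Clang builtin __builtin___clear_cache().

    #include <stdbool.h>
    #include <stddef.h>

    /* Hypothetical per-page bookkeeping, standing in for struct page and
     * its PG_arch_1 flag bit. */
    struct guest_page {
            char   *host_va;        /* host-side mapping of the page */
            size_t  size;
            bool    icache_clean;   /* "already flushed" marker */
    };

    /* Flush lazily: only the first executable mapping of a page pays for
     * the d-cache/i-cache maintenance; later mappings find the flag set,
     * in the same spirit as kvmppc_mmu_flush_icache() above. */
    static void map_executable(struct guest_page *pg)
    {
            if (!pg->icache_clean) {
                    __builtin___clear_cache(pg->host_va, pg->host_va + pg->size);
                    pg->icache_clean = true;
            }
            /* ... install the mapping with execute permission ... */
    }

    int main(void)
    {
            static char code_page[4096];
            struct guest_page pg = { code_page, sizeof(code_page), false };

            map_executable(&pg);    /* first map: flushes */
            map_executable(&pg);    /* second map: flag set, no flush */
            return 0;
    }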

From e8143ccb6b501f78bb95d9c5ee100d18423008cf Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Tue, 14 Aug 2012 12:10:09 +0000
Subject: [PATCH 4/6] ppc: e500_tlb memset clears nothing

Put the parameters the right way around.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=44031

Reported-by: David Binderman
Signed-off-by: Alan Cox
Signed-off-by: Andrew Morton
Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/e500_tlb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index fb3bb3ad8b3c..a2b66717813d 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -322,11 +322,11 @@ static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
 static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
         if (vcpu_e500->g2h_tlb1_map)
-                memset(vcpu_e500->g2h_tlb1_map,
-                       sizeof(u64) * vcpu_e500->gtlb_params[1].entries, 0);
+                memset(vcpu_e500->g2h_tlb1_map, 0,
+                       sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
         if (vcpu_e500->h2g_tlb1_rmap)
-                memset(vcpu_e500->h2g_tlb1_rmap,
-                       sizeof(unsigned int) * host_tlb_params[1].entries, 0);
+                memset(vcpu_e500->h2g_tlb1_rmap, 0,
+                       sizeof(unsigned int) * host_tlb_params[1].entries);
 }
 
 static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
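
The bug class fixed above is easy to demonstrate in isolation: memset()
takes the fill byte as its second argument and the length as its third,
so swapping them turns the call into "write zero bytes". The snippet
below is plain user-space C, not kernel code:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned int map[4] = { 1, 2, 3, 4 };

            /* Swapped arguments, as in the old code: the length is 0,
             * so nothing is written and the stale data survives. */
            memset(map, sizeof(map), 0);
            printf("swapped args:  %u %u %u %u\n", map[0], map[1], map[2], map[3]);

            /* Correct order: fill byte 0, length sizeof(map). */
            memset(map, 0, sizeof(map));
            printf("correct order: %u %u %u %u\n", map[0], map[1], map[2], map[3]);
            return 0;
    }

The first printf still shows 1 2 3 4; only the corrected call actually
clears the array. Some compilers now warn about exactly this argument
transposition.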

From 35f2d16bb9ace0fb2671b8232839944ad9057c6f Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Mon, 20 Aug 2012 18:35:39 +0900
Subject: [PATCH 5/6] KVM: MMU: Fix mmu_shrink() so that it can free mmu pages as intended

Although the possible race described in

  commit 85b7059169e128c57a3a8a3e588fb89cb2031da1
    KVM: MMU: fix shrinking page from the empty mmu

was correct, the real cause of that issue was a more trivial bug of
mmu_shrink() introduced by

  commit 1952639665e92481c34c34c3e2a71bf3e66ba362
    KVM: MMU: do not iterate over all VMs in mmu_shrink()

Here is the bug:

        if (kvm->arch.n_used_mmu_pages > 0) {
                if (!nr_to_scan--)
                        break;
                continue;
        }

We skip VMs whose n_used_mmu_pages is not zero and try to shrink
others: in other words we try to shrink empty ones by mistake.

This patch reverses the logic so that mmu_shrink() can free pages from
the first VM whose n_used_mmu_pages is not zero.  Note that we also add
comments explaining the role of nr_to_scan which is not practically
important now, hoping this will be improved in the future.

Signed-off-by: Takuya Yoshikawa
Cc: Gleb Natapov
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/mmu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01ca00423938..7fbd0d273ea8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4112,17 +4112,22 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                 int idx;
                 LIST_HEAD(invalid_list);
 
+                /*
+                 * Never scan more than sc->nr_to_scan VM instances.
+                 * Will not hit this condition practically since we do not try
+                 * to shrink more than one VM and it is very unlikely to see
+                 * !n_used_mmu_pages so many times.
+                 */
+                if (!nr_to_scan--)
+                        break;
                 /*
                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
                  * here. We may skip a VM instance errorneosly, but we do not
                  * want to shrink a VM that only started to populate its MMU
                  * anyway.
                  */
-                if (kvm->arch.n_used_mmu_pages > 0) {
-                        if (!nr_to_scan--)
-                                break;
+                if (!kvm->arch.n_used_mmu_pages)
                         continue;
-                }
 
                 idx = srcu_read_lock(&kvm->srcu);
                 spin_lock(&kvm->mmu_lock);

From 5ad105e569c45dcfad50d724c61d5061248be755 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 19 Aug 2012 14:34:31 +0300
Subject: [PATCH 6/6] KVM: x86 emulator: use stack size attribute to mask rsp in stack ops

The sub-register used to access the stack (sp, esp, or rsp) is not
determined by the address size attribute like other memory references,
but by the stack segment's B bit (if not in x86_64 mode).  Fix by using
the existing stack_mask() to figure out the correct mask.

This long-existing bug was exposed by a combination of a27685c33acccce
(emulate invalid guest state by default), which causes many more
instructions to be emulated, and a seabios change (possibly a bug) which
causes the high 16 bits of esp to become polluted across calls to real
mode software interrupts.

Signed-off-by: Avi Kivity
Signed-off-by: Marcelo Tosatti
---
 arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97d9a9914ba8..a3b57a27be88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
         return address_mask(ctxt, reg);
 }
 
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+        assign_masked(reg, *reg + inc, mask);
+}
+
 static inline void
 register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
+        ulong mask;
+
         if (ctxt->ad_bytes == sizeof(unsigned long))
-                *reg += inc;
+                mask = ~0UL;
         else
-                *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
+                mask = ad_mask(ctxt);
+        masked_increment(reg, mask, inc);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+        masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
 }
 
 static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
 {
         struct segmented_address addr;
 
-        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
-        addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+        rsp_increment(ctxt, -bytes);
+        addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
         addr.seg = VCPU_SREG_SS;
 
         return segmented_write(ctxt, addr, data, bytes);
@@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
         int rc;
         struct segmented_address addr;
 
-        addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+        addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
         addr.seg = VCPU_SREG_SS;
         rc = segmented_read(ctxt, addr, dest, len);
         if (rc != X86EMUL_CONTINUE)
                 return rc;
 
-        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
+        rsp_increment(ctxt, len);
         return rc;
 }
 
@@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 
         while (reg >= VCPU_REGS_RAX) {
                 if (reg == VCPU_REGS_RSP) {
-                        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
-                                                        ctxt->op_bytes);
+                        rsp_increment(ctxt, ctxt->op_bytes);
                         --reg;
                 }
 
@@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
         rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
         if (rc != X86EMUL_CONTINUE)
                 return rc;
-        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
+        rsp_increment(ctxt, ctxt->src.val);
         return X86EMUL_CONTINUE;
 }
 
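
To see what the masked update buys, here is a standalone sketch of the
arithmetic behind masked_increment() in the patch above. It assumes
assign_masked() updates only the bits selected by the mask and preserves
the rest, which is what the new rsp_increment() relies on; main() and
the example values are invented for illustration.

    #include <stdio.h>

    typedef unsigned long ulong;

    /* Only the bits selected by 'mask' take the incremented value; the
     * bits outside the mask are preserved. */
    static void masked_increment(ulong *reg, ulong mask, long inc)
    {
            *reg = (*reg & ~mask) | ((*reg + inc) & mask);
    }

    int main(void)
    {
            /* 16-bit stack (SS B bit clear): only sp, the low 16 bits of
             * esp, may move; polluted high bits must stay untouched. */
            ulong rsp = 0xdead0ff8UL;

            masked_increment(&rsp, 0xffffUL, -8);  /* e.g. a 16-bit push */
            printf("16-bit stack: %#lx\n", rsp);   /* 0xdead0ff0 */

            /* With an all-ones mask the whole register moves, which is
             * what the old address-size-based code effectively did. */
            masked_increment(&rsp, ~0UL, -8);
            printf("full width:   %#lx\n", rsp);   /* 0xdead0fe8 */
            return 0;
    }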