From 439793d4b3c99e550daebd868bbd58967c93d0b3 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Wed, 1 Aug 2012 17:01:42 +0300
Subject: [PATCH 1/6] KVM: x86: update KVM_SAVE_MSRS_BEGIN to correct value

When MSR_KVM_PV_EOI_EN was added to the msrs_to_save array,
KVM_SAVE_MSRS_BEGIN was not updated accordingly.

Signed-off-by: Gleb Natapov
Signed-off-by: Marcelo Tosatti
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42bce48f6928..dce75b760312 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN 9
+#define KVM_SAVE_MSRS_BEGIN 10
 static u32 msrs_to_save[] = {
         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW,
         MSR_KVM_WALL_CLOCK_NEW,

From 04f995a544d1289ffb8108849cd71b1325c5af6a Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Mon, 6 Aug 2012 00:03:28 +0000
Subject: [PATCH 2/6] KVM: PPC: Book3S HV: Fix incorrect branch in H_CEDE code

In handling the H_CEDE hypercall, if this vcpu has already been prodded
(with the H_PROD hypercall, which Linux guests don't in fact use), we
branch to a numeric label '1f'.  Unfortunately there is another '1:'
label before the one that we want to jump to.  This fixes the problem
by using a textual label, 'kvm_cede_prodded'.  It also changes the label
for another longish branch from '2:' to 'kvm_cede_exit' to avoid a
possible future problem if code modifications add another numeric '2:'
label in between.

Signed-off-by: Paul Mackerras
Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 5a84c8d3d040..44b72feaff7d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1421,13 +1421,13 @@ _GLOBAL(kvmppc_h_cede)
         sync                    /* order setting ceded vs. testing prodded */
         lbz     r5,VCPU_PRODDED(r3)
         cmpwi   r5,0
-        bne     1f
+        bne     kvm_cede_prodded
         li      r0,0            /* set trap to 0 to say hcall is handled */
         stw     r0,VCPU_TRAP(r3)
         li      r0,H_SUCCESS
         std     r0,VCPU_GPR(R3)(r3)
 BEGIN_FTR_SECTION
-        b       2f              /* just send it up to host on 970 */
+        b       kvm_cede_exit   /* just send it up to host on 970 */
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 
 /*
@@ -1446,7 +1446,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
         or      r4,r4,r0
         PPC_POPCNTW(R7,R4)
         cmpw    r7,r8
-        bge     2f
+        bge     kvm_cede_exit
         stwcx.  r4,0,r6
         bne     31b
         li      r0,1
@@ -1555,7 +1555,8 @@ kvm_end_cede:
         b       hcall_real_fallback
 
 /* cede when already previously prodded case */
-1:      li      r0,0
+kvm_cede_prodded:
+        li      r0,0
         stb     r0,VCPU_PRODDED(r3)
         sync                    /* order testing prodded vs. clearing ceded */
         stb     r0,VCPU_CEDED(r3)
@@ -1563,7 +1564,8 @@ kvm_end_cede:
         blr
 
 /* we've ceded but we want to give control to the host */
-2:      li      r3,H_TOO_HARD
+kvm_cede_exit:
+        li      r3,H_TOO_HARD
         blr
 
 secondary_too_late:

From 249ba1ee0f8fcb4e40caa5fbea11dafde201cc46 Mon Sep 17 00:00:00 2001
From: Alexander Graf
Date: Fri, 3 Aug 2012 13:56:33 +0200
Subject: [PATCH 3/6] KVM: PPC: Add cache flush on page map

When we map a page that wasn't icache cleared before, do so when first
mapping it in KVM, using the same information bits as the Linux mapping
logic. That way we are 100% sure that any page we map does not have
stale entries in the icache.

Signed-off-by: Alexander Graf
---
 arch/powerpc/include/asm/kvm_host.h   |  1 +
 arch/powerpc/include/asm/kvm_ppc.h    | 12 ++++++++++++
 arch/powerpc/kvm/book3s_32_mmu_host.c |  3 +++
 arch/powerpc/kvm/book3s_64_mmu_host.c |  2 ++
 arch/powerpc/kvm/e500_tlb.c           |  3 +++
 arch/powerpc/mm/mem.c                 |  1 +
 6 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50ea12fd7bf5..a8bf5c673a3c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 
 #define KVM_MAX_VCPUS   NR_CPUS
 #define KVM_MAX_VCORES  NR_CPUS

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937a23b9..e006f0bdea95 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -219,4 +219,16 @@ void kvmppc_claim_lpid(long lpid);
 void kvmppc_free_lpid(long lpid);
 void kvmppc_init_lpid(unsigned long nr_lpids);
 
+static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
+{
+        /* Clear i-cache for new pages */
+        struct page *page;
+        page = pfn_to_page(pfn);
+        if (!test_bit(PG_arch_1, &page->flags)) {
+                flush_dcache_icache_page(page);
+                set_bit(PG_arch_1, &page->flags);
+        }
+}
+
+
 #endif /* __POWERPC_KVM_PPC_H__ */

diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index f922c29bb234..837f13e7b6bf 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -211,6 +211,9 @@ next_pteg:
                 pteg1 |= PP_RWRX;
         }
 
+        if (orig_pte->may_execute)
+                kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
+
         local_irq_disable();
 
         if (pteg[rr]) {

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 10fc8ec9d2a8..0688b6b39585 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -126,6 +126,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 
         if (!orig_pte->may_execute)
                 rflags |= HPTE_R_N;
+        else
+                kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
 
         hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M);
 

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index c510fc961302..fb3bb3ad8b3c 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -539,6 +539,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
         kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
                                 ref, gvaddr, stlbe);
+
+        /* Clear i-cache for new pages */
+        kvmppc_mmu_flush_icache(pfn);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index baaafde7d135..fbdad0e3929a 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -469,6 +469,7 @@ void flush_dcache_icache_page(struct page *page)
         __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 #endif
 }
+EXPORT_SYMBOL(flush_dcache_icache_page);
 
 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
 {
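
The pattern this patch adds (flush the caches for a page only the first
time it is mapped executable, and remember that fact in a per-page flag,
PG_arch_1) can be illustrated outside the kernel. The sketch below is
only an analogy: struct guest_page, its fields and map_executable() are
hypothetical names invented for the example, and the one real API used
is the GCC/Clang builtin __builtin___clear_cache().

    #include <stdbool.h>
    #include <stddef.h>

    /* Hypothetical per-page bookkeeping, standing in for struct page and
     * its PG_arch_1 flag bit. */
    struct guest_page {
            char   *host_va;        /* host-side mapping of the page */
            size_t  size;
            bool    icache_clean;   /* "already flushed" marker */
    };

    /* Flush lazily: only the first executable mapping of a page pays for
     * the d-cache/i-cache maintenance; later mappings find the flag set,
     * in the same spirit as kvmppc_mmu_flush_icache() above. */
    static void map_executable(struct guest_page *pg)
    {
            if (!pg->icache_clean) {
                    __builtin___clear_cache(pg->host_va, pg->host_va + pg->size);
                    pg->icache_clean = true;
            }
            /* ... install the mapping with execute permission ... */
    }

    int main(void)
    {
            static char code_page[4096];
            struct guest_page pg = { code_page, sizeof(code_page), false };

            map_executable(&pg);    /* first map: flushes */
            map_executable(&pg);    /* second map: flag set, no flush */
            return 0;
    }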

From e8143ccb6b501f78bb95d9c5ee100d18423008cf Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Tue, 14 Aug 2012 12:10:09 +0000
Subject: [PATCH 4/6] ppc: e500_tlb memset clears nothing

Put the parameters the right way around.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=44031

Reported-by: David Binderman
Signed-off-by: Alan Cox
Signed-off-by: Andrew Morton
Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/e500_tlb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index fb3bb3ad8b3c..a2b66717813d 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -322,11 +322,11 @@ static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
 static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
         if (vcpu_e500->g2h_tlb1_map)
-                memset(vcpu_e500->g2h_tlb1_map,
-                       sizeof(u64) * vcpu_e500->gtlb_params[1].entries, 0);
+                memset(vcpu_e500->g2h_tlb1_map, 0,
+                       sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
         if (vcpu_e500->h2g_tlb1_rmap)
-                memset(vcpu_e500->h2g_tlb1_rmap,
-                       sizeof(unsigned int) * host_tlb_params[1].entries, 0);
+                memset(vcpu_e500->h2g_tlb1_rmap, 0,
+                       sizeof(unsigned int) * host_tlb_params[1].entries);
 }
 
 static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
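
The bug class fixed above is easy to demonstrate in isolation: memset()
takes the fill byte as its second argument and the length as its third,
so swapping them turns the call into "write zero bytes". The snippet
below is plain user-space C, not kernel code:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned int map[4] = { 1, 2, 3, 4 };

            /* Swapped arguments, as in the old code: the length is 0,
             * so nothing is written and the stale data survives. */
            memset(map, sizeof(map), 0);
            printf("swapped args:  %u %u %u %u\n", map[0], map[1], map[2], map[3]);

            /* Correct order: fill byte 0, length sizeof(map). */
            memset(map, 0, sizeof(map));
            printf("correct order: %u %u %u %u\n", map[0], map[1], map[2], map[3]);
            return 0;
    }

The first printf still shows 1 2 3 4; only the corrected call actually
clears the array. Some compilers now warn about exactly this argument
transposition.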

From 35f2d16bb9ace0fb2671b8232839944ad9057c6f Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Mon, 20 Aug 2012 18:35:39 +0900
Subject: [PATCH 5/6] KVM: MMU: Fix mmu_shrink() so that it can free mmu pages as intended

Although the possible race described in

  commit 85b7059169e128c57a3a8a3e588fb89cb2031da1
    KVM: MMU: fix shrinking page from the empty mmu

was correct, the real cause of that issue was a more trivial bug of
mmu_shrink() introduced by

  commit 1952639665e92481c34c34c3e2a71bf3e66ba362
    KVM: MMU: do not iterate over all VMs in mmu_shrink()

Here is the bug:

        if (kvm->arch.n_used_mmu_pages > 0) {
                if (!nr_to_scan--)
                        break;
                continue;
        }

We skip VMs whose n_used_mmu_pages is not zero and try to shrink
others: in other words we try to shrink empty ones by mistake.

This patch reverses the logic so that mmu_shrink() can free pages from
the first VM whose n_used_mmu_pages is not zero.  Note that we also add
comments explaining the role of nr_to_scan which is not practically
important now, hoping this will be improved in the future.

Signed-off-by: Takuya Yoshikawa
Cc: Gleb Natapov
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/mmu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01ca00423938..7fbd0d273ea8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4112,17 +4112,22 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                 int idx;
                 LIST_HEAD(invalid_list);
 
+                /*
+                 * Never scan more than sc->nr_to_scan VM instances.
+                 * Will not hit this condition practically since we do not try
+                 * to shrink more than one VM and it is very unlikely to see
+                 * !n_used_mmu_pages so many times.
+                 */
+                if (!nr_to_scan--)
+                        break;
                 /*
                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
                  * here. We may skip a VM instance errorneosly, but we do not
                  * want to shrink a VM that only started to populate its MMU
                  * anyway.
                  */
-                if (kvm->arch.n_used_mmu_pages > 0) {
-                        if (!nr_to_scan--)
-                                break;
+                if (!kvm->arch.n_used_mmu_pages)
                         continue;
-                }
 
                 idx = srcu_read_lock(&kvm->srcu);
                 spin_lock(&kvm->mmu_lock);

From 5ad105e569c45dcfad50d724c61d5061248be755 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 19 Aug 2012 14:34:31 +0300
Subject: [PATCH 6/6] KVM: x86 emulator: use stack size attribute to mask rsp in stack ops

The sub-register used to access the stack (sp, esp, or rsp) is not
determined by the address size attribute like other memory references,
but by the stack segment's B bit (if not in x86_64 mode).  Fix by using
the existing stack_mask() to figure out the correct mask.

This long-existing bug was exposed by a combination of a27685c33acccce
(emulate invalid guest state by default), which causes many more
instructions to be emulated, and a seabios change (possibly a bug) which
causes the high 16 bits of esp to become polluted across calls to real
mode software interrupts.

Signed-off-by: Avi Kivity
Signed-off-by: Marcelo Tosatti
---
 arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97d9a9914ba8..a3b57a27be88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
         return address_mask(ctxt, reg);
 }
 
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+        assign_masked(reg, *reg + inc, mask);
+}
+
 static inline void
 register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
+        ulong mask;
+
         if (ctxt->ad_bytes == sizeof(unsigned long))
-                *reg += inc;
+                mask = ~0UL;
         else
-                *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
+                mask = ad_mask(ctxt);
+        masked_increment(reg, mask, inc);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+        masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
 }
 
 static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
 {
         struct segmented_address addr;
 
-        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
-        addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+        rsp_increment(ctxt, -bytes);
+        addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
         addr.seg = VCPU_SREG_SS;
 
         return segmented_write(ctxt, addr, data, bytes);
@@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
         int rc;
         struct segmented_address addr;
 
-        addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+        addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
         addr.seg = VCPU_SREG_SS;
         rc = segmented_read(ctxt, addr, dest, len);
         if (rc != X86EMUL_CONTINUE)
                 return rc;
 
-        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
+        rsp_increment(ctxt, len);
         return rc;
 }
 
@@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 
         while (reg >= VCPU_REGS_RAX) {
                 if (reg == VCPU_REGS_RSP) {
-                        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
-                                                        ctxt->op_bytes);
+                        rsp_increment(ctxt, ctxt->op_bytes);
                         --reg;
                 }
 
@@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
         rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
         if (rc != X86EMUL_CONTINUE)
                 return rc;
-        register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
+        rsp_increment(ctxt, ctxt->src.val);
         return X86EMUL_CONTINUE;
 }
 
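
To see what the masked update buys, here is a standalone sketch of the
arithmetic behind masked_increment() in the patch above. It assumes
assign_masked() updates only the bits selected by the mask and preserves
the rest, which is what the new rsp_increment() relies on; main() and
the example values are invented for illustration.

    #include <stdio.h>

    typedef unsigned long ulong;

    /* Only the bits selected by 'mask' take the incremented value; the
     * bits outside the mask are preserved. */
    static void masked_increment(ulong *reg, ulong mask, long inc)
    {
            *reg = (*reg & ~mask) | ((*reg + inc) & mask);
    }

    int main(void)
    {
            /* 16-bit stack (SS B bit clear): only sp, the low 16 bits of
             * esp, may move; polluted high bits must stay untouched. */
            ulong rsp = 0xdead0ff8UL;

            masked_increment(&rsp, 0xffffUL, -8);  /* e.g. a 16-bit push */
            printf("16-bit stack: %#lx\n", rsp);   /* 0xdead0ff0 */

            /* With an all-ones mask the whole register moves, which is
             * what the old address-size-based code effectively did. */
            masked_increment(&rsp, ~0UL, -8);
            printf("full width:   %#lx\n", rsp);   /* 0xdead0fe8 */
            return 0;
    }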