From 0a00a775d2a49d16efba721c786859844184599c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 27 Nov 2013 16:35:22 +0200 Subject: [PATCH 01/67] KVM: Change maintainer email address My Red Hat email address will stop working soon. Updated if with other address. Signed-off-by: Gleb Natapov Signed-off-by: Paolo Bonzini --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f216db847022..b4433da68f46 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4891,7 +4891,7 @@ F: include/linux/sunrpc/ F: include/uapi/linux/sunrpc/ KERNEL VIRTUAL MACHINE (KVM) -M: Gleb Natapov +M: Gleb Natapov M: Paolo Bonzini L: kvm@vger.kernel.org W: http://www.linux-kvm.org From 210b1607012cc9034841a393e0591b2c86d9e26c Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 19 Sep 2013 16:26:18 +0200 Subject: [PATCH 02/67] KVM: s390: Removed SIE_INTERCEPT_UCONTROL The SIE_INTERCEPT_UCONTROL can be removed by moving the related code from kvm_arch_vcpu_ioctl_run() to vcpu_post_run(). Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 24 ++++++------------------ arch/s390/kvm/kvm-s390.h | 2 -- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 569494e01ec6..7f4783525144 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -732,14 +732,12 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) if (exit_reason >= 0) { rc = 0; - } else { - if (kvm_is_ucontrol(vcpu->kvm)) { - rc = SIE_INTERCEPT_UCONTROL; - } else { - VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); - trace_kvm_s390_sie_fault(vcpu); - rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - } + } else if (kvm_is_ucontrol(vcpu->kvm)) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = + current->thread.gmap_addr; + vcpu->run->s390_ucontrol.pgm_code = 0x10; + rc = -EREMOTE; } memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); @@ -833,16 +831,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rc = -EINTR; } -#ifdef CONFIG_KVM_S390_UCONTROL - if (rc == SIE_INTERCEPT_UCONTROL) { - kvm_run->exit_reason = KVM_EXIT_S390_UCONTROL; - kvm_run->s390_ucontrol.trans_exc_code = - current->thread.gmap_addr; - kvm_run->s390_ucontrol.pgm_code = 0x10; - rc = 0; - } -#endif - if (rc == -EOPNOTSUPP) { /* intercept cannot be handled in-kernel, prepare kvm-run */ kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index b44912a32949..aad541fc9098 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -27,8 +27,6 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); /* declare vfacilities extern */ extern unsigned long *vfacilities; -/* negativ values are error codes, positive values for internal conditions */ -#define SIE_INTERCEPT_UCONTROL (1<<0) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ From ac5b03420150241dc2db3cb4aa4f58c1e7e4640f Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 7 Oct 2013 17:50:22 +0200 Subject: [PATCH 03/67] KVM: s390: Removed VIRTIODESCSPACE VIRTIODESCSPACE is completely unused nowadays and thus can be removed without any problems. Signed-off-by: Thomas Huth Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index aad541fc9098..fcd25b4bc7af 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -19,9 +19,6 @@ #include #include -/* The current code can have up to 256 pages for virtio */ -#define VIRTIODESCSPACE (256ul * 4096ul) - typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); /* declare vfacilities extern */ From f092669e743048f50c714a1af7f8e3478d7b9e1b Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 9 Oct 2013 14:15:54 +0200 Subject: [PATCH 04/67] KVM: s390: Fix access to CR6 in TPI handler The TPI handler currently uses vcpu->run->s.regs.crs[6] to get the current value of CR6. I think this is wrong, because vcpu->run->s.regs.crs is only updated when kvm_arch_vcpu_ioctl_run() drops back to userspace. So let's change the TPI handler to use vcpu->arch.sie_block->gcr[6] instead. Signed-off-by: Thomas Huth Acked-by: Christian Borntraeger Signed-off-by: Cornelia Huck --- arch/s390/kvm/priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 2440602e6df1..b18fe52c2c47 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -197,7 +197,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu) if (addr & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); cc = 0; - inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0); + inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->arch.sie_block->gcr[6], 0); if (!inti) goto no_interrupt; cc = 1; From c95221f69dfa5d3696b2b91374cbd7e5897657c5 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 9 Oct 2013 16:49:03 +0200 Subject: [PATCH 05/67] KVM: s390: Do not set CC3 for EQBS and SQBS The EQBS and SQBS instructions do not set CC3 for invalid channels, but should throw an operation exception instead when not available. Thus they should not be handled by the handle_io_inst() wrapper but drop to userspace instead (which will then inject the operation exception). Signed-off-by: Thomas Huth Acked-by: Christian Borntraeger Signed-off-by: Cornelia Huck --- arch/s390/kvm/priv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index b18fe52c2c47..05537ab22382 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -638,7 +638,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) static const intercept_handler_t b9_handlers[256] = { [0x8d] = handle_epsw, - [0x9c] = handle_io_inst, [0xaf] = handle_pfmf, }; @@ -731,7 +730,6 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) static const intercept_handler_t eb_handlers[256] = { [0x2f] = handle_lctlg, - [0x8a] = handle_io_inst, }; int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) From e879892c725217a4af1012f31ae56be762473216 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 6 Nov 2013 15:46:33 +0100 Subject: [PATCH 06/67] KVM: s390: Always store status during SIGP STOP_AND_STORE_STATUS The SIGP order STOP_AND_STORE_STATUS is defined to stop a CPU and store its status. However, we only stored the status if the CPU was still running, so make sure that the status is now also stored if the CPU was already stopped. This fixes the problem that the CPU information was not stored correctly in kdump files, rendering them unreadable. Signed-off-by: Thomas Huth Acked-by: Christian Borntraeger Cc: stable@vger.kernel.org Signed-off-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 25 +++++++++++++++---------- arch/s390/kvm/kvm-s390.h | 4 ++-- arch/s390/kvm/sigp.c | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7f4783525144..55eb8dec2a77 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -873,7 +873,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, void *from, * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit * KVM_S390_STORE_STATUS_PREFIXED: -> prefix */ -int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) +int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr) { unsigned char archmode = 1; int prefix; @@ -891,15 +891,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) } else prefix = 0; - /* - * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy - * copying in vcpu load/put. Lets update our copies before we save - * it into the save area - */ - save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - save_fp_regs(vcpu->arch.guest_fpregs.fprs); - save_access_regs(vcpu->run->s.regs.acrs); - if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), vcpu->arch.guest_fpregs.fprs, 128, prefix)) return -EFAULT; @@ -944,6 +935,20 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return 0; } +int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) +{ + /* + * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy + * copying in vcpu load/put. Lets update our copies before we save + * it into the save area + */ + save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); + save_fp_regs(vcpu->arch.guest_fpregs.fprs); + save_access_regs(vcpu->run->s.regs.acrs); + + return kvm_s390_store_status_unloaded(vcpu, addr); +} + static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index fcd25b4bc7af..36f6b1890d4b 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -145,8 +145,8 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, - unsigned long addr); +int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); +int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); void s390_vcpu_block(struct kvm_vcpu *vcpu); void s390_vcpu_unblock(struct kvm_vcpu *vcpu); void exit_sie(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index bec398c57acf..6805601262e0 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -130,6 +130,7 @@ unlock: static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) { struct kvm_s390_interrupt_info *inti; + int rc = SIGP_CC_ORDER_CODE_ACCEPTED; inti = kzalloc(sizeof(*inti), GFP_ATOMIC); if (!inti) @@ -139,6 +140,8 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) spin_lock_bh(&li->lock); if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { kfree(inti); + if ((action & ACTION_STORE_ON_STOP) != 0) + rc = -ESHUTDOWN; goto out; } list_add_tail(&inti->list, &li->list); @@ -150,7 +153,7 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action) out: spin_unlock_bh(&li->lock); - return SIGP_CC_ORDER_CODE_ACCEPTED; + return rc; } static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) @@ -174,6 +177,16 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action) unlock: spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); + + if ((action & ACTION_STORE_ON_STOP) != 0 && rc == -ESHUTDOWN) { + /* If the CPU has already been stopped, we still have + * to save the status when doing stop-and-store. This + * has to be done after unlocking all spinlocks. */ + struct kvm_vcpu *dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + rc = kvm_s390_store_status_unloaded(dst_vcpu, + KVM_S390_STORE_STATUS_NOADDR); + } + return rc; } From 178bd789775ab29233e0553155253ec8d73af71f Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 13 Nov 2013 20:28:18 +0100 Subject: [PATCH 07/67] KVM: s390: Fix clock comparator field for STORE STATUS Only the most 7 significant bytes of the clock comparator must be saved to the status area, and the byte at offset 304 has to be zero. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 55eb8dec2a77..1bb1ddaf93c0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -877,6 +877,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr) { unsigned char archmode = 1; int prefix; + u64 clkcomp; if (addr == KVM_S390_STORE_STATUS_NOADDR) { if (copy_to_guest_absolute(vcpu, 163ul, &archmode, 1)) @@ -920,8 +921,9 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr) &vcpu->arch.sie_block->cputm, 8, prefix)) return -EFAULT; + clkcomp = vcpu->arch.sie_block->ckc >> 8; if (__guestcopy(vcpu, addr + offsetof(struct save_area, clk_cmp), - &vcpu->arch.sie_block->ckc, 8, prefix)) + &clkcomp, 8, prefix)) return -EFAULT; if (__guestcopy(vcpu, addr + offsetof(struct save_area, acc_regs), From 743db27c526e0f31cc507959d662e97e2048a86f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Nov 2013 13:56:47 +0100 Subject: [PATCH 08/67] KVM: s390: fix diagnose code extraction The diagnose code to be used is the contents of the base register (if not zero), plus the displacement. The current code ignores the base register contents. So let's fix that... Reviewed-by: Cornelia Huck Cc: stable@vger.kernel.org Signed-off-by: Heiko Carstens Signed-off-by: Cornelia Huck --- arch/s390/kvm/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 78d967f180f4..5ff29be7d87a 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -137,7 +137,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) { - int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; + int code = kvm_s390_get_base_disp_rs(vcpu) & 0xffff; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); From 00e9e435f97b409db8986f9cd35d126ae2d02a0c Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 13 Nov 2013 20:48:51 +0100 Subject: [PATCH 09/67] KVM: s390: Add SIGP store-status-at-address order The STORE STATUS AT ADDRESS order of SIGP was still missing. Now it is supported, using the common kvm_s390_store_status() function. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/sigp.c | 35 +++++++++++++++++++++++++++++++++++ arch/s390/kvm/trace.h | 1 + 2 files changed, 36 insertions(+) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 6805601262e0..c137ed35a86d 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -275,6 +275,37 @@ out_fi: return rc; } +static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, u16 cpu_id, + u32 addr, u64 *reg) +{ + struct kvm_vcpu *dst_vcpu = NULL; + int flags; + int rc; + + if (cpu_id < KVM_MAX_VCPUS) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_id); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + + spin_lock_bh(&dst_vcpu->arch.local_int.lock); + flags = atomic_read(dst_vcpu->arch.local_int.cpuflags); + spin_unlock_bh(&dst_vcpu->arch.local_int.lock); + if (!(flags & CPUSTAT_STOPPED)) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INCORRECT_STATE; + return SIGP_CC_STATUS_STORED; + } + + addr &= 0x7ffffe00; + rc = kvm_s390_store_status_unloaded(dst_vcpu, addr); + if (rc == -EFAULT) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INVALID_PARAMETER; + rc = SIGP_CC_STATUS_STORED; + } + return rc; +} + static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, u64 *reg) { @@ -379,6 +410,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP | ACTION_STOP_ON_STOP); break; + case SIGP_STORE_STATUS_AT_ADDRESS: + rc = __sigp_store_status_at_addr(vcpu, cpu_addr, parameter, + &vcpu->run->s.regs.gprs[r1]); + break; case SIGP_SET_ARCHITECTURE: vcpu->stat.instruction_sigp_arch++; rc = __sigp_set_arch(vcpu, parameter); diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 0c991c6748ab..3db76b2daed7 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -175,6 +175,7 @@ TRACE_EVENT(kvm_s390_intercept_validity, {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \ {SIGP_SET_ARCHITECTURE, "set architecture"}, \ {SIGP_SET_PREFIX, "set prefix"}, \ + {SIGP_STORE_STATUS_AT_ADDRESS, "store status at addr"}, \ {SIGP_SENSE_RUNNING, "sense running"}, \ {SIGP_RESTART, "restart"} From 36daca9bb36f0395755817d1b0c45ab6fbf0441b Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 14 Nov 2013 11:08:20 +0100 Subject: [PATCH 10/67] KVM: s390: Removed kvm_s390_inject_sigp_stop() The function kvm_s390_inject_sigp_stop() as been unused since the removal of the old mmu reload code and thus can be removed safely. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.h | 1 - arch/s390/kvm/sigp.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 36f6b1890d4b..095cf51b16ec 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -128,7 +128,6 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm, int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt *s390int); int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); -int __must_check kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 cr6, u64 schid); diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index c137ed35a86d..c3700585b4bb 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -190,12 +190,6 @@ unlock: return rc; } -int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - return __inject_sigp_stop(li, action); -} - static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) { int rc; From 4fda342cc7f577599c53fd27b99c953c7b1da18a Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 19 Nov 2013 14:59:12 -0500 Subject: [PATCH 11/67] arm/arm64: kvm: Use virt_to_idmap instead of virt_to_phys for idmap mappings KVM initialisation fails on architectures implementing virt_to_idmap() because virt_to_phys() on such architectures won't fetch you the correct idmap page. So update the KVM ARM code to use the virt_to_idmap() to fix the issue. Since the KVM code is shared between arm and arm64, we create kvm_virt_to_phys() and handle the redirection in respective headers. Cc: Christoffer Dall Cc: Marc Zyngier Cc: Catalin Marinas Signed-off-by: Santosh Shilimkar Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 1 + arch/arm/kvm/mmu.c | 8 ++++---- arch/arm64/include/asm/kvm_mmu.h | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 77de4a41cc50..2d122adcdb22 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -140,6 +140,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, } #define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) +#define kvm_virt_to_phys(x) virt_to_idmap((unsigned long)(x)) #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 580906989db1..659db0ed1370 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -916,9 +916,9 @@ int kvm_mmu_init(void) { int err; - hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start); - hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end); - hyp_idmap_vector = virt_to_phys(__kvm_hyp_init); + hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); + hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); + hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) { /* @@ -945,7 +945,7 @@ int kvm_mmu_init(void) */ kvm_flush_dcache_to_poc(init_bounce_page, len); - phys_base = virt_to_phys(init_bounce_page); + phys_base = kvm_virt_to_phys(init_bounce_page); hyp_idmap_vector += phys_base - hyp_idmap_start; hyp_idmap_start = phys_base; hyp_idmap_end = phys_base + len; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 680f74e67497..7f1f9408ff66 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -136,6 +136,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, } #define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) +#define kvm_virt_to_phys(x) __virt_to_phys((unsigned long)(x)) #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ From d9101fca3d572b8675b7fe3f4ef9f6f99bbf4364 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 13 Nov 2013 11:15:02 +0100 Subject: [PATCH 12/67] KVM: s390: diagnose call documentation Add some further documentation on the DIAGNOSE calls we support. Reviewed-by: Thomas Huth Signed-off-by: Cornelia Huck --- Documentation/virtual/kvm/hypercalls.txt | 3 + Documentation/virtual/kvm/s390-diag.txt | 80 ++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 Documentation/virtual/kvm/s390-diag.txt diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index 022198e389d7..d922d73efa7b 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -17,6 +17,9 @@ S390: S390 uses diagnose instruction as hypercall (0x500) along with hypercall number in R1. + For further information on the S390 diagnose call as supported by KVM, + refer to Documentation/virtual/kvm/s390-diag.txt. + PowerPC: It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. Return value is placed in R3. diff --git a/Documentation/virtual/kvm/s390-diag.txt b/Documentation/virtual/kvm/s390-diag.txt new file mode 100644 index 000000000000..f1de4fbade15 --- /dev/null +++ b/Documentation/virtual/kvm/s390-diag.txt @@ -0,0 +1,80 @@ +The s390 DIAGNOSE call on KVM +============================= + +KVM on s390 supports the DIAGNOSE call for making hypercalls, both for +native hypercalls and for selected hypercalls found on other s390 +hypervisors. + +Note that bits are numbered as by the usual s390 convention (most significant +bit on the left). + + +General remarks +--------------- + +DIAGNOSE calls by the guest cause a mandatory intercept. This implies +all supported DIAGNOSE calls need to be handled by either KVM or its +userspace. + +All DIAGNOSE calls supported by KVM use the RS-a format: + +-------------------------------------- +| '83' | R1 | R3 | B2 | D2 | +-------------------------------------- +0 8 12 16 20 31 + +The second-operand address (obtained by the base/displacement calculation) +is not used to address data. Instead, bits 48-63 of this address specify +the function code, and bits 0-47 are ignored. + +The supported DIAGNOSE function codes vary by the userspace used. For +DIAGNOSE function codes not specific to KVM, please refer to the +documentation for the s390 hypervisors defining them. + + +DIAGNOSE function code 'X'500' - KVM virtio functions +----------------------------------------------------- + +If the function code specifies 0x500, various virtio-related functions +are performed. + +General register 1 contains the virtio subfunction code. Supported +virtio subfunctions depend on KVM's userspace. Generally, userspace +provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). + +Upon completion of the DIAGNOSE instruction, general register 2 contains +the function's return code, which is either a return code or a subcode +specific value. + +Subcode 0 - s390-virtio notification and early console printk + Handled by userspace. + +Subcode 1 - s390-virtio reset + Handled by userspace. + +Subcode 2 - s390-virtio set status + Handled by userspace. + +Subcode 3 - virtio-ccw notification + Handled by either userspace or KVM (ioeventfd case). + + General register 2 contains a subchannel-identification word denoting + the subchannel of the virtio-ccw proxy device to be notified. + + General register 3 contains the number of the virtqueue to be notified. + + General register 4 contains a 64bit identifier for KVM usage (the + kvm_io_bus cookie). If general register 4 does not contain a valid + identifier, it is ignored. + + After completion of the DIAGNOSE call, general register 2 may contain + a 64bit identifier (in the kvm_io_bus cookie case). + + See also the virtio standard for a discussion of this hypercall. + + +DIAGNOSE function code 'X'501 - KVM breakpoint +---------------------------------------------- + +If the function code specifies 0x501, breakpoint functions may be performed. +This function code is handled by userspace. From 949c007acd8b6887cf5f3ac86512a7b12fa245dc Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 26 Nov 2013 12:27:16 +0100 Subject: [PATCH 13/67] KVM: s390: Use helper function to set CC in SIGP handler We've got a helper function for setting the condition code now, so let's use it in the SIGP handler, too. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/sigp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index c3700585b4bb..bc0d85a24b1b 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -435,7 +435,6 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) if (rc < 0) return rc; - vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); - vcpu->arch.sie_block->gpsw.mask |= (rc & 3ul) << 44; + kvm_s390_set_psw_cc(vcpu, rc); return 0; } From b13d3580ee47ba3b2814e90b8a9b8241f7a4ba83 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 21 Nov 2013 16:01:48 +0100 Subject: [PATCH 14/67] KVM: s390: Add the SIGP order CONDITIONAL EMERGENCY SIGNAL This patch adds the missing SIGP order "conditional emergency signal" by calling the "emergency signal" SIGP handler if the required conditions are met. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/include/asm/sigp.h | 1 + arch/s390/kvm/sigp.c | 37 +++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index 5a87d16d3e7c..c002cd529c71 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -12,6 +12,7 @@ #define SIGP_SET_PREFIX 13 #define SIGP_STORE_STATUS_AT_ADDRESS 14 #define SIGP_SET_ARCHITECTURE 18 +#define SIGP_COND_EMERGENCY_SIGNAL 19 #define SIGP_SENSE_RUNNING 21 /* SIGP condition codes */ diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index bc0d85a24b1b..eee1402349f2 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -1,7 +1,7 @@ /* * handling interprocessor communication * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 2008, 2013 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) @@ -89,6 +89,37 @@ unlock: return rc; } +static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr, + u16 asn, u64 *reg) +{ + struct kvm_vcpu *dst_vcpu = NULL; + const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; + u16 p_asn, s_asn; + psw_t *psw; + u32 flags; + + if (cpu_addr < KVM_MAX_VCPUS) + dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr); + if (!dst_vcpu) + return SIGP_CC_NOT_OPERATIONAL; + flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); + psw = &dst_vcpu->arch.sie_block->gpsw; + p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ + s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ + + /* Deliver the emergency signal? */ + if (!(flags & CPUSTAT_STOPPED) + || (psw->mask & psw_int_mask) != psw_int_mask + || ((flags & CPUSTAT_WAIT) && psw->addr != 0) + || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) { + return __sigp_emergency(vcpu, cpu_addr); + } else { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_INCORRECT_STATE; + return SIGP_CC_STATUS_STORED; + } +} + static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; @@ -417,6 +448,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) rc = __sigp_set_prefix(vcpu, cpu_addr, parameter, &vcpu->run->s.regs.gprs[r1]); break; + case SIGP_COND_EMERGENCY_SIGNAL: + rc = __sigp_conditional_emergency(vcpu, cpu_addr, parameter, + &vcpu->run->s.regs.gprs[r1]); + break; case SIGP_SENSE_RUNNING: vcpu->stat.instruction_sigp_sense_running++; rc = __sigp_sense_running(vcpu, cpu_addr, From cc92d6dea11cd43842e20cd05c066963de586417 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 27 Nov 2013 11:47:10 +0100 Subject: [PATCH 15/67] KVM: s390: Reworked SIGP RESTART order When SIGP RESTART detected an illegal CPU address, there is no need to drop to userspace, we can return CC3 to the guest directly instead. Also renamed __sigp_restart() to sigp_check_callable() (since this is a better description of what the function is really doing) and moved a string specific to RESTART to the calling place instead, so that this function gets usable by other SIGP orders, too. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/sigp.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index eee1402349f2..509547d4fca1 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -363,7 +363,8 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr, return rc; } -static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) +/* Test whether the destination CPU is available and not busy */ +static int sigp_check_callable(struct kvm_vcpu *vcpu, u16 cpu_addr) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li; @@ -382,9 +383,6 @@ static int __sigp_restart(struct kvm_vcpu *vcpu, u16 cpu_addr) spin_lock_bh(&li->lock); if (li->action_bits & ACTION_STOP_ON_STOP) rc = SIGP_CC_BUSY; - else - VCPU_EVENT(vcpu, 4, "sigp restart %x to handle userspace", - cpu_addr); spin_unlock_bh(&li->lock); out: spin_unlock(&fi->lock); @@ -459,10 +457,15 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) break; case SIGP_RESTART: vcpu->stat.instruction_sigp_restart++; - rc = __sigp_restart(vcpu, cpu_addr); - if (rc == SIGP_CC_BUSY) - break; - /* user space must know about restart */ + rc = sigp_check_callable(vcpu, cpu_addr); + if (rc == SIGP_CC_ORDER_CODE_ACCEPTED) { + VCPU_EVENT(vcpu, 4, + "sigp restart %x to handle userspace", + cpu_addr); + /* user space must know about restart */ + rc = -EOPNOTSUPP; + } + break; default: return -EOPNOTSUPP; } From 58bc33b2b700f8524772f3fc20272da2187060c8 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 3 Dec 2013 12:54:55 +0100 Subject: [PATCH 16/67] KVM: s390: SIGP START has to report BUSY while stopping a CPU Just like the RESTART order, the START order also has to report BUSY while a STOP request is pending, to avoid that the START might be ignored due to a race condition between the STOP and the START order. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/include/asm/sigp.h | 1 + arch/s390/kvm/sigp.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index c002cd529c71..d091aa1aaf11 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -5,6 +5,7 @@ #define SIGP_SENSE 1 #define SIGP_EXTERNAL_CALL 2 #define SIGP_EMERGENCY_SIGNAL 3 +#define SIGP_START 4 #define SIGP_STOP 5 #define SIGP_RESTART 6 #define SIGP_STOP_AND_STORE_STATUS 9 diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 509547d4fca1..87c2b3a3bd3e 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -455,6 +455,11 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) rc = __sigp_sense_running(vcpu, cpu_addr, &vcpu->run->s.regs.gprs[r1]); break; + case SIGP_START: + rc = sigp_check_callable(vcpu, cpu_addr); + if (rc == SIGP_CC_ORDER_CODE_ACCEPTED) + rc = -EOPNOTSUPP; /* Handle START in user space */ + break; case SIGP_RESTART: vcpu->stat.instruction_sigp_restart++; rc = sigp_check_callable(vcpu, cpu_addr); From ff1f3cb4b3ac5d039f02679f34cb1498d110d241 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Mon, 9 Dec 2013 18:30:01 +0100 Subject: [PATCH 17/67] KVM: s390: ioeventfd: ignore leftmost bits The diagnose 500 subcode 3 contains the 32 bit subchannel id in bits 32-63 (counting from the left). As for other I/O instructions, bits 0-31 should be ignored and thus not be passed to kvm_io_bus_write_cookie(). This fixes a bug where the guest passed non-zero bits 0-31 which the host tried to interpret, leading to ioeventfd notification failures. Cc: stable@vger.kernel.org Signed-off-by: Dominik Dingel Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- arch/s390/kvm/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 5ff29be7d87a..8216c0e0b2e2 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -121,7 +121,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) * - gpr 4 contains the index on the bus (optionally) */ ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, - vcpu->run->s.regs.gprs[2], + vcpu->run->s.regs.gprs[2] & 0xffffffff, 8, &vcpu->run->s.regs.gprs[3], vcpu->run->s.regs.gprs[4]); From 2961e8764faad212234e93907a370a7c36a67da5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 25 Nov 2013 15:37:13 +0200 Subject: [PATCH 18/67] KVM: VMX: shadow VM_(ENTRY|EXIT)_CONTROLS vmcs field VM_(ENTRY|EXIT)_CONTROLS vmcs fields are read/written on each guest entry but most times it can be avoided since values do not changes. Keep fields copy in memory to avoid unnecessary reads from vmcs. Signed-off-by: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 112 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b2fe1c252f35..1024689ac717 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -418,6 +418,8 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif + u32 vm_entry_controls_shadow; + u32 vm_exit_controls_shadow; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -1326,6 +1328,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_ENTRY_CONTROLS, val); + vmx->vm_entry_controls_shadow = val; +} + +static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_entry_controls_shadow != val) + vm_entry_controls_init(vmx, val); +} + +static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_entry_controls_shadow; +} + + +static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); +} + +static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); +} + +static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VM_EXIT_CONTROLS, val); + vmx->vm_exit_controls_shadow = val; +} + +static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) +{ + if (vmx->vm_exit_controls_shadow != val) + vm_exit_controls_init(vmx, val); +} + +static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) +{ + return vmx->vm_exit_controls_shadow; +} + + +static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); +} + +static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) +{ + vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); +} + static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; @@ -1410,11 +1468,11 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } -static void clear_atomic_switch_msr_special(unsigned long entry, - unsigned long exit) +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); - vmcs_clear_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_clearbit(vmx, entry); + vm_exit_controls_clearbit(vmx, exit); } static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) @@ -1425,14 +1483,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + clear_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special( + clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); return; @@ -1453,14 +1512,15 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } -static void add_atomic_switch_msr_special(unsigned long entry, - unsigned long exit, unsigned long guest_val_vmcs, - unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, + unsigned long entry, unsigned long exit, + unsigned long guest_val_vmcs, unsigned long host_val_vmcs, + u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); vmcs_write64(host_val_vmcs, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, entry); - vmcs_set_bits(VM_EXIT_CONTROLS, exit); + vm_entry_controls_setbit(vmx, entry); + vm_exit_controls_setbit(vmx, exit); } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, @@ -1472,7 +1532,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + add_atomic_switch_msr_special(vmx, + VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, GUEST_IA32_EFER, HOST_IA32_EFER, @@ -1482,7 +1543,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special( + add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, GUEST_IA32_PERF_GLOBAL_CTRL, @@ -3182,14 +3243,10 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.efer = efer; if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); + vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer; } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer & ~EFER_LME; } @@ -3217,9 +3274,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_IA32E_MODE); + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } @@ -4346,10 +4401,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); @@ -7759,12 +7815,12 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exit_control = vmcs_config.vmexit_ctrl; if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - vmcs_write32(VM_EXIT_CONTROLS, exit_control); + vm_exit_controls_init(vmx, exit_control); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. */ - vmcs_write32(VM_ENTRY_CONTROLS, + vm_entry_controls_init(vmx, (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); @@ -8186,7 +8242,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | - (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); + (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ @@ -8390,6 +8446,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) vcpu->cpu = cpu; put_cpu(); + vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); + vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ From 6dfacadd5858882eee1983995854d4e1fb1b966e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 4 Dec 2013 08:58:54 +0100 Subject: [PATCH 19/67] KVM: nVMX: Add support for activity state HLT We can easily emulate the HLT activity state for L1: If it decides that L2 shall be halted on entry, just invoke the normal emulation of halt after switching to L2. We do not depend on specific host features to provide this, so we can expose the capability unconditionally. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 966502d4682e..2067264fb7f5 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -100,6 +100,7 @@ #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 +#define VMX_MISC_ACTIVITY_HLT 0x00000040 /* VMCS Encodings */ enum vmcs_field { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1024689ac717..f90320b204a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2340,6 +2340,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -7938,7 +7939,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -8067,6 +8069,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) prepare_vmcs02(vcpu, vmcs12); + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + return kvm_emulate_halt(vcpu); + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet From c08ac06ab3f3cdb8d34376c3a8a5e46a31a62c8f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 13 Dec 2013 15:07:21 +0900 Subject: [PATCH 20/67] KVM: Use cond_resched() directly and remove useless kvm_resched() Since the commit 15ad7146 ("KVM: Use the scheduler preemption notifiers to make kvm preemptible"), the remaining stuff in this function is a simple cond_resched() call with an extra need_resched() check which was there to avoid dropping VCPUs unnecessarily. Now it is meaningless. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/ia64/kvm/kvm-ia64.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 8 -------- 5 files changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 985bf80c622e..53f44bee9ebb 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -702,7 +702,7 @@ again: out: srcu_read_unlock(&vcpu->kvm->srcu, idx); if (r > 0) { - kvm_resched(vcpu); + cond_resched(); idx = srcu_read_lock(&vcpu->kvm->srcu); goto again; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 072287f1c3bc..3fa99b20894f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1348,7 +1348,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) kvm_guest_exit(); preempt_enable(); - kvm_resched(vcpu); + cond_resched(); spin_lock(&vc->lock); now = get_tb(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21ef1ba184ae..4fb1ee619c1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6125,7 +6125,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } if (need_resched()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_resched(vcpu); + cond_resched(); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9523d2ad7535..4ecf10775c4f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -583,7 +583,6 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); bool kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); -void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a0aa84b5941a..03c97e7ae4ca 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1710,14 +1710,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ -void kvm_resched(struct kvm_vcpu *vcpu) -{ - if (!need_resched()) - return; - cond_resched(); -} -EXPORT_SYMBOL_GPL(kvm_resched); - bool kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct pid *pid; From 9357d93952143b178fa9d1f5095b8f273b01a1f1 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 13 Dec 2013 15:08:38 +0900 Subject: [PATCH 21/67] KVM: x86: Add comment on vcpu_enter_guest()'s return value Giving proper names to the 0 and 1 was once suggested. But since 0 is returned to the userspace, giving it another name can introduce extra confusion. This patch just explains the meanings instead. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4fb1ee619c1c..1dc0359e2095 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5865,6 +5865,11 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_apic_update_tmr(vcpu, tmr); } +/* + * Returns 1 to let __vcpu_run() continue the guest execution loop without + * exiting to the userspace. Otherwise, the value will be returned to the + * userspace. + */ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; From beb11fc71370bb49c58d7454d5d9c5a00a7cdb4b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 12 Dec 2013 21:42:24 +0530 Subject: [PATCH 22/67] KVM: Documentation: Fix typo for KVM_ARM_VCPU_INIT ioctl Fix minor typo in "Parameters:" of KVM_ARM_VCPU_INIT documentation. Signed-off-by: Anup Patel Signed-off-by: Pranavkumar Sawargaonkar Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/api.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a30035dd4c26..aad3244a579e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2327,7 +2327,7 @@ current state. "addr" is ignored. Capability: basic Architectures: arm, arm64 Type: vcpu ioctl -Parameters: struct struct kvm_vcpu_init (in) +Parameters: struct kvm_vcpu_init (in) Returns: 0 on success; -1 on error Errors:  EINVAL:    the target is unknown, or the combination of features is invalid. From ca3f257ae570c37d3da30a524a2f61ce602c6c99 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Dec 2013 12:55:46 +0100 Subject: [PATCH 23/67] KVM: nVMX: Support direct APIC access from L2 It's a pathological case, but still a valid one: If L1 disables APIC virtualization and also allows L2 to directly write to the APIC page, we have to forcibly enable APIC virtualization while in L2 if the in-kernel APIC is in use. This allows to run the direct interrupt test case in the vmx unit test without x2APIC. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f90320b204a9..31eb5776d854 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7763,6 +7763,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); + } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vcpu->kvm->arch.apic_access_page)); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); From 4c4d563b49830a66537c3f51070dad74d7a81d3a Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 18 Dec 2013 19:16:24 +0100 Subject: [PATCH 24/67] KVM: VMX: Do not skip the instruction if handle_dr injects a fault If kvm_get_dr or kvm_set_dr reports that it raised a fault, we must not advance the instruction pointer. Otherwise the exception will hit the wrong instruction. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 31eb5776d854..9cc54842ae14 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5137,10 +5137,14 @@ static int handle_dr(struct kvm_vcpu *vcpu) reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; - if (!kvm_get_dr(vcpu, dr, &val)) - kvm_register_write(vcpu, reg, val); + + if (kvm_get_dr(vcpu, dr, &val)) + return 1; + kvm_register_write(vcpu, reg, val); } else - kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); + if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg])) + return 1; + skip_emulated_instruction(vcpu); return 1; } From 989c6b34f6a9480e397b170cc62237e89bf4fdb9 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 19 Dec 2013 15:28:51 -0200 Subject: [PATCH 25/67] KVM: MMU: handle invalid root_hpa at __direct_map It is possible for __direct_map to be called on invalid root_hpa (-1), two examples: 1) try_async_pf -> can_do_async_pf -> vmx_interrupt_allowed -> nested_vmx_vmexit 2) vmx_handle_exit -> vmx_interrupt_allowed -> nested_vmx_vmexit Then to load_vmcs12_host_state and kvm_mmu_reset_context. Check for this possibility, let fault exception be regenerated. BZ: https://bugzilla.redhat.com/show_bug.cgi?id=924916 Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 40772ef0f2b1..31a570287fcc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2659,6 +2659,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int emulate = 0; gfn_t pseudo_gfn; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return 0; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, From 478a8237f656d86d25b3e4e4bf3c48f590156294 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 19 Nov 2013 17:43:19 -0800 Subject: [PATCH 26/67] arm: KVM: Don't return PSCI_INVAL if waitqueue is inactive The current KVM implementation of PSCI returns INVALID_PARAMETERS if the waitqueue for the corresponding CPU is not active. This does not seem correct, since KVM should not care what the specific thread is doing, for example, user space may not have called KVM_RUN on this VCPU yet or the thread may be busy looping to user space because it received a signal; this is really up to the user space implementation. Instead we should check specifically that the CPU is marked as being turned off, regardless of the VCPU thread state, and if it is, we shall simply clear the pause flag on the CPU and wake up the thread if it happens to be blocked for us. Further, the implementation seems to be racy when executing multiple VCPU threads. There really isn't a reasonable user space programming scheme to ensure all secondary CPUs have reached kvm_vcpu_first_run_init before turning on the boot CPU. Therefore, set the pause flag on the vcpu at VCPU init time (which can reasonably be expected to be completed for all CPUs by user space before running any VCPUs) and clear both this flag and the feature (in case the feature can somehow get set again in the future) and ping the waitqueue on turning on a VCPU using PSCI. Reported-by: Peter Maydell Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 30 +++++++++++++++++++----------- arch/arm/kvm/psci.c | 11 ++++++----- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 2a700e00528d..151eb9160482 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -478,15 +478,6 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) return ret; } - /* - * Handle the "start in power-off" case by calling into the - * PSCI code. - */ - if (test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) { - *vcpu_reg(vcpu, 0) = KVM_PSCI_FN_CPU_OFF; - kvm_psci_call(vcpu); - } - return 0; } @@ -700,6 +691,24 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return -EINVAL; } +static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, + struct kvm_vcpu_init *init) +{ + int ret; + + ret = kvm_vcpu_set_target(vcpu, init); + if (ret) + return ret; + + /* + * Handle the "start in power-off" case by marking the VCPU as paused. + */ + if (__test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) + vcpu->arch.pause = true; + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -713,8 +722,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&init, argp, sizeof(init))) return -EFAULT; - return kvm_vcpu_set_target(vcpu, &init); - + return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); } case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 0881bf169fbc..448f60e8d23c 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -54,15 +54,15 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) } } - if (!vcpu) + /* + * Make sure the caller requested a valid CPU and that the CPU is + * turned off. + */ + if (!vcpu || !vcpu->arch.pause) return KVM_PSCI_RET_INVAL; target_pc = *vcpu_reg(source_vcpu, 2); - wq = kvm_arch_vcpu_wq(vcpu); - if (!waitqueue_active(wq)) - return KVM_PSCI_RET_INVAL; - kvm_reset_vcpu(vcpu); /* Gracefully handle Thumb2 entry point */ @@ -79,6 +79,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) vcpu->arch.pause = false; smp_mb(); /* Make sure the above is visible */ + wq = kvm_arch_vcpu_wq(vcpu); wake_up_interruptible(wq); return KVM_PSCI_RET_SUCCESS; From a1a64387adeeba7a34ce06f2774e81f496ee803b Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 16 Nov 2013 10:51:25 -0800 Subject: [PATCH 27/67] arm/arm64: KVM: arch_timer: Initialize cntvoff at kvm_init Initialize the cntvoff at kvm_init_vm time, not before running the VCPUs at the first time because that will overwrite any potentially restored values from user space. Cc: Andre Przywara Acked-by: Marc Zynger Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 2 ++ virt/kvm/arm/vgic.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 2a700e00528d..13205bd9b359 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -137,6 +137,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_free_stage2_pgd; + kvm_timer_init(kvm); + /* Mark the initial VMID generation invalid */ kvm->arch.vmid_gen = 0; diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 685fc72fc751..81e9481184a7 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1409,7 +1409,6 @@ int kvm_vgic_init(struct kvm *kvm) for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) vgic_set_target_reg(kvm, 0, i); - kvm_timer_init(kvm); kvm->arch.vgic.ready = true; out: mutex_unlock(&kvm->lock); From 39735a3a390431bcf60f9174b7d64f787fd6afa9 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 13 Dec 2013 14:23:26 +0100 Subject: [PATCH 28/67] ARM/KVM: save and restore generic timer registers For migration to work we need to save (and later restore) the state of each core's virtual generic timer. Since this is per VCPU, we can use the [gs]et_one_reg ioctl and export the three needed registers (control, counter, compare value). Though they live in cp15 space, we don't use the existing list, since they need special accessor functions and the arch timer is optional. Acked-by: Marc Zynger Signed-off-by: Andre Przywara Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 + arch/arm/include/uapi/asm/kvm.h | 20 +++++++ arch/arm/kvm/guest.c | 92 ++++++++++++++++++++++++++++++- arch/arm64/include/uapi/asm/kvm.h | 18 ++++++ virt/kvm/arm/arch_timer.c | 34 ++++++++++++ 5 files changed, 166 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 8a6f6db14ee4..098f7dd6d564 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -225,4 +225,7 @@ static inline int kvm_arch_dev_ioctl_check_extension(long ext) int kvm_perf_init(void); int kvm_perf_teardown(void); +u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); +int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index c498b60c0505..835b8678de03 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -119,6 +119,26 @@ struct kvm_arch_memory_slot { #define KVM_REG_ARM_32_CRN_MASK 0x0000000000007800 #define KVM_REG_ARM_32_CRN_SHIFT 11 +#define ARM_CP15_REG_SHIFT_MASK(x,n) \ + (((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK) + +#define __ARM_CP15_REG(op1,crn,crm,op2) \ + (KVM_REG_ARM | (15 << KVM_REG_ARM_COPROC_SHIFT) | \ + ARM_CP15_REG_SHIFT_MASK(op1, OPC1) | \ + ARM_CP15_REG_SHIFT_MASK(crn, 32_CRN) | \ + ARM_CP15_REG_SHIFT_MASK(crm, CRM) | \ + ARM_CP15_REG_SHIFT_MASK(op2, 32_OPC2)) + +#define ARM_CP15_REG32(...) (__ARM_CP15_REG(__VA_ARGS__) | KVM_REG_SIZE_U32) + +#define __ARM_CP15_REG64(op1,crm) \ + (__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64) +#define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__) + +#define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1) +#define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14) +#define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14) + /* Normal registers are mapped as coprocessor 16. */ #define KVM_REG_ARM_CORE (0x0010 << KVM_REG_ARM_COPROC_SHIFT) #define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4) diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 20f8d97904af..2786eae10c0d 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -109,6 +109,83 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return -EINVAL; } +#ifndef CONFIG_KVM_ARM_TIMER + +#define NUM_TIMER_REGS 0 + +static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + return 0; +} + +static bool is_timer_reg(u64 index) +{ + return false; +} + +int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) +{ + return 0; +} + +u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) +{ + return 0; +} + +#else + +#define NUM_TIMER_REGS 3 + +static bool is_timer_reg(u64 index) +{ + switch (index) { + case KVM_REG_ARM_TIMER_CTL: + case KVM_REG_ARM_TIMER_CNT: + case KVM_REG_ARM_TIMER_CVAL: + return true; + } + return false; +} + +static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + if (put_user(KVM_REG_ARM_TIMER_CTL, uindices)) + return -EFAULT; + uindices++; + if (put_user(KVM_REG_ARM_TIMER_CNT, uindices)) + return -EFAULT; + uindices++; + if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices)) + return -EFAULT; + + return 0; +} + +#endif + +static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + int ret; + + ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); + if (ret != 0) + return ret; + + return kvm_arm_timer_set_reg(vcpu, reg->id, val); +} + +static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + void __user *uaddr = (void __user *)(long)reg->addr; + u64 val; + + val = kvm_arm_timer_get_reg(vcpu, reg->id); + return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)); +} + static unsigned long num_core_regs(void) { return sizeof(struct kvm_regs) / sizeof(u32); @@ -121,7 +198,8 @@ static unsigned long num_core_regs(void) */ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { - return num_core_regs() + kvm_arm_num_coproc_regs(vcpu); + return num_core_regs() + kvm_arm_num_coproc_regs(vcpu) + + NUM_TIMER_REGS; } /** @@ -133,6 +211,7 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { unsigned int i; const u64 core_reg = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE; + int ret; for (i = 0; i < sizeof(struct kvm_regs)/sizeof(u32); i++) { if (put_user(core_reg | i, uindices)) @@ -140,6 +219,11 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) uindices++; } + ret = copy_timer_indices(vcpu, uindices); + if (ret) + return ret; + uindices += NUM_TIMER_REGS; + return kvm_arm_copy_coproc_indices(vcpu, uindices); } @@ -153,6 +237,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) return get_core_reg(vcpu, reg); + if (is_timer_reg(reg->id)) + return get_timer_reg(vcpu, reg); + return kvm_arm_coproc_get_reg(vcpu, reg); } @@ -166,6 +253,9 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) return set_core_reg(vcpu, reg); + if (is_timer_reg(reg->id)) + return set_timer_reg(vcpu, reg); + return kvm_arm_coproc_set_reg(vcpu, reg); } diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 5031f4263937..7c25ca8b02b3 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -129,6 +129,24 @@ struct kvm_arch_memory_slot { #define KVM_REG_ARM64_SYSREG_OP2_MASK 0x0000000000000007 #define KVM_REG_ARM64_SYSREG_OP2_SHIFT 0 +#define ARM64_SYS_REG_SHIFT_MASK(x,n) \ + (((x) << KVM_REG_ARM64_SYSREG_ ## n ## _SHIFT) & \ + KVM_REG_ARM64_SYSREG_ ## n ## _MASK) + +#define __ARM64_SYS_REG(op0,op1,crn,crm,op2) \ + (KVM_REG_ARM64 | KVM_REG_ARM64_SYSREG | \ + ARM64_SYS_REG_SHIFT_MASK(op0, OP0) | \ + ARM64_SYS_REG_SHIFT_MASK(op1, OP1) | \ + ARM64_SYS_REG_SHIFT_MASK(crn, CRN) | \ + ARM64_SYS_REG_SHIFT_MASK(crm, CRM) | \ + ARM64_SYS_REG_SHIFT_MASK(op2, OP2)) + +#define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64) + +#define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1) +#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) +#define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) + /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 #define KVM_ARM_IRQ_TYPE_MASK 0xff diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index c2e1ef4604e8..5081e809821f 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -182,6 +182,40 @@ static void kvm_timer_init_interrupt(void *info) enable_percpu_irq(host_vtimer_irq, 0); } +int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + switch (regid) { + case KVM_REG_ARM_TIMER_CTL: + timer->cntv_ctl = value; + break; + case KVM_REG_ARM_TIMER_CNT: + vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value; + break; + case KVM_REG_ARM_TIMER_CVAL: + timer->cntv_cval = value; + break; + default: + return -1; + } + return 0; +} + +u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + switch (regid) { + case KVM_REG_ARM_TIMER_CTL: + return timer->cntv_ctl; + case KVM_REG_ARM_TIMER_CNT: + return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + case KVM_REG_ARM_TIMER_CVAL: + return timer->cntv_cval; + } + return (u64)-1; +} static int kvm_timer_cpu_notify(struct notifier_block *self, unsigned long action, void *cpu) From e1ba0207a1b3714bb3f000e506285ae5123cdfa7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 23 Sep 2013 14:55:55 -0700 Subject: [PATCH 29/67] ARM: KVM: Allow creating the VGIC after VCPUs Rework the VGIC initialization slightly to allow initialization of the vgic cpu-specific state even if the irqchip (the VGIC) hasn't been created by user space yet. This is safe, because the vgic data structures are already allocated when the CPU is allocated if VGIC support is compiled into the kernel. Further, the init process does not depend on any other information and the sacrifice is a slight performance degradation for creating VMs in the no-VGIC case. The reason is that the new device control API doesn't mandate creating the VGIC before creating the VCPU and it is unreasonable to require user space to create the VGIC before creating the VCPUs. At the same time move the irqchip_in_kernel check out of kvm_vcpu_first_run_init and into the init function to make the per-vcpu and global init functions symmetric and add comments on the exported functions making it a bit easier to understand the init flow by only looking at vgic.c. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 7 ++++--- virt/kvm/arm/vgic.c | 22 +++++++++++++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 13205bd9b359..c9fe9d71be73 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -464,6 +464,8 @@ static void update_vttbr(struct kvm *kvm) static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) { + int ret; + if (likely(vcpu->arch.has_run_once)) return 0; @@ -473,9 +475,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) * Initialize the VGIC before running a vcpu the first time on * this VM. */ - if (irqchip_in_kernel(vcpu->kvm) && - unlikely(!vgic_initialized(vcpu->kvm))) { - int ret = kvm_vgic_init(vcpu->kvm); + if (unlikely(!vgic_initialized(vcpu->kvm))) { + ret = kvm_vgic_init(vcpu->kvm); if (ret) return ret; } diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 81e9481184a7..5e9df47778fb 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1243,15 +1243,19 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data) return IRQ_HANDLED; } +/** + * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state + * @vcpu: pointer to the vcpu struct + * + * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to + * this vcpu and enable the VGIC for this VCPU + */ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int i; - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - if (vcpu->vcpu_id >= VGIC_MAX_CPUS) return -EBUSY; @@ -1383,10 +1387,22 @@ out: return ret; } +/** + * kvm_vgic_init - Initialize global VGIC state before running any VCPUs + * @kvm: pointer to the kvm struct + * + * Map the virtual CPU interface into the VM before running any VCPUs. We + * can't do this at creation time, because user space must first set the + * virtual CPU interface address in the guest physical address space. Also + * initialize the ITARGETSRn regs to 0 on the emulated distributor. + */ int kvm_vgic_init(struct kvm *kvm) { int ret = 0, i; + if (!irqchip_in_kernel(kvm)) + return 0; + mutex_lock(&kvm->lock); if (vgic_initialized(kvm)) From 7330672befe6269e575f79b924a7068b26c144b4 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 25 Oct 2013 17:29:18 +0100 Subject: [PATCH 30/67] KVM: arm-vgic: Support KVM_CREATE_DEVICE for VGIC Support creating the ARM VGIC device through the KVM_CREATE_DEVICE ioctl, which can then later be leveraged to use the KVM_{GET/SET}_DEVICE_ATTR, which is useful both for setting addresses in a more generic API than the ARM-specific one and is useful for save/restore of VGIC state. Adds KVM_CAP_DEVICE_CTRL to ARM capabilities. Note that we change the check for creating a VGIC from bailing out if any VCPUs were created, to bailing out if any VCPUs were ever run. This is an important distinction that shouldn't break anything, but allows creating the VGIC after the VCPUs have been created. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- .../virtual/kvm/devices/arm-vgic.txt | 10 +++ arch/arm/kvm/arm.c | 1 + include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/arm/vgic.c | 63 ++++++++++++++++++- virt/kvm/kvm_main.c | 5 ++ 6 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 Documentation/virtual/kvm/devices/arm-vgic.txt diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt new file mode 100644 index 000000000000..38f27f709a99 --- /dev/null +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -0,0 +1,10 @@ +ARM Virtual Generic Interrupt Controller (VGIC) +=============================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 + +Only one VGIC instance may be instantiated through either this API or the +legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt +controller, requiring emulated user-space devices to inject interrupts to the +VGIC instead of directly to CPUs. diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c9fe9d71be73..cc7c41af9c38 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -190,6 +190,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQCHIP: r = vgic_present; break; + case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ecf10775c4f..1f46f66f60ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1075,6 +1075,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_vfio_ops; +extern struct kvm_device_ops kvm_arm_vgic_v2_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 902f12461873..b647c2917391 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -853,6 +853,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_TYPE_ARM_VGIC_V2 5 /* * ioctls for VM fds diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 5e9df47778fb..b15d6c17a090 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1433,20 +1433,45 @@ out: int kvm_vgic_create(struct kvm *kvm) { - int ret = 0; + int i, vcpu_lock_idx = -1, ret = 0; + struct kvm_vcpu *vcpu; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) || kvm->arch.vgic.vctrl_base) { + if (kvm->arch.vgic.vctrl_base) { ret = -EEXIST; goto out; } + /* + * Any time a vcpu is run, vcpu_load is called which tries to grab the + * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure + * that no other VCPUs are run while we create the vgic. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!mutex_trylock(&vcpu->mutex)) + goto out_unlock; + vcpu_lock_idx = i; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.has_run_once) { + ret = -EBUSY; + goto out_unlock; + } + } + spin_lock_init(&kvm->arch.vgic.lock); kvm->arch.vgic.vctrl_base = vgic_vctrl_base; kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; +out_unlock: + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&vcpu->mutex); + } + out: mutex_unlock(&kvm->lock); return ret; @@ -1510,3 +1535,37 @@ int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) mutex_unlock(&kvm->lock); return r; } + +static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + return -ENXIO; +} + +static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + return -ENXIO; +} + +static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + return -ENXIO; +} + +static void vgic_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +static int vgic_create(struct kvm_device *dev, u32 type) +{ + return kvm_vgic_create(dev->kvm); +} + +struct kvm_device_ops kvm_arm_vgic_v2_ops = { + .name = "kvm-arm-vgic", + .create = vgic_create, + .destroy = vgic_destroy, + .set_attr = vgic_set_attr, + .get_attr = vgic_get_attr, + .has_attr = vgic_has_attr, +}; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 03c97e7ae4ca..3efba97bdce2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2272,6 +2272,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, case KVM_DEV_TYPE_VFIO: ops = &kvm_vfio_ops; break; +#endif +#ifdef CONFIG_KVM_ARM_VGIC + case KVM_DEV_TYPE_ARM_VGIC_V2: + ops = &kvm_arm_vgic_v2_ops; + break; #endif default: return -ENODEV; From ce01e4e8874d410738f4b4733b26642d6611a331 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 23 Sep 2013 14:55:56 -0700 Subject: [PATCH 31/67] KVM: arm-vgic: Set base addr through device API Support setting the distributor and cpu interface base addresses in the VM physical address space through the KVM_{SET,GET}_DEVICE_ATTR API in addition to the ARM specific API. This has the added benefit of being able to share more code in user space and do things in a uniform manner. Also deprecate the older API at the same time, but backwards compatibility will be maintained. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/api.txt | 7 +- .../virtual/kvm/devices/arm-vgic.txt | 11 +++ arch/arm/include/uapi/asm/kvm.h | 2 + arch/arm/kvm/arm.c | 2 +- include/kvm/arm_vgic.h | 2 +- virt/kvm/arm/vgic.c | 87 ++++++++++++++++--- 6 files changed, 96 insertions(+), 15 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a30035dd4c26..867112f1968d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2391,7 +2391,8 @@ struct kvm_reg_list { This ioctl returns the guest registers that are supported for the KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. -4.85 KVM_ARM_SET_DEVICE_ADDR + +4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) Capability: KVM_CAP_ARM_SET_DEVICE_ADDR Architectures: arm, arm64 @@ -2429,6 +2430,10 @@ must be called after calling KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the base addresses will return -EEXIST. +Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API +should be used instead. + + 4.86 KVM_PPC_RTAS_DEFINE_TOKEN Capability: KVM_CAP_PPC_RTAS diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 38f27f709a99..c9febb2a0c3e 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -8,3 +8,14 @@ Only one VGIC instance may be instantiated through either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt controller, requiring emulated user-space devices to inject interrupts to the VGIC instead of directly to CPUs. + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GIC distributor + register mappings. + + KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) + Base address in the guest physical address space of the GIC virtual cpu + interface register mappings. diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 835b8678de03..76a742769e2b 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -163,6 +163,8 @@ struct kvm_arch_memory_slot { #define KVM_REG_ARM_VFP_FPINST 0x1009 #define KVM_REG_ARM_VFP_FPINST2 0x100A +/* Device Control API: ARM VGIC */ +#define KVM_DEV_ARM_VGIC_GRP_ADDR 0 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index cc7c41af9c38..f290b2250ed5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -776,7 +776,7 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, case KVM_ARM_DEVICE_VGIC_V2: if (!vgic_present) return -ENXIO; - return kvm_vgic_set_addr(kvm, type, dev_addr->addr); + return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); default: return -ENODEV; } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7e2d15837b02..be85127bfed3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -144,7 +144,7 @@ struct kvm_run; struct kvm_exit_mmio; #ifdef CONFIG_KVM_ARM_VGIC -int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr); +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); int kvm_vgic_hyp_init(void); int kvm_vgic_init(struct kvm *kvm); int kvm_vgic_create(struct kvm *kvm); diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index b15d6c17a090..45db48de4282 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -1495,6 +1495,12 @@ static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, { int ret; + if (addr & ~KVM_PHYS_MASK) + return -E2BIG; + + if (addr & (SZ_4K - 1)) + return -EINVAL; + if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) return -EEXIST; if (addr + size < addr) @@ -1507,26 +1513,41 @@ static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, return ret; } -int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) +/** + * kvm_vgic_addr - set or get vgic VM base addresses + * @kvm: pointer to the vm struct + * @type: the VGIC addr type, one of KVM_VGIC_V2_ADDR_TYPE_XXX + * @addr: pointer to address value + * @write: if true set the address in the VM address space, if false read the + * address + * + * Set or get the vgic base addresses for the distributor and the virtual CPU + * interface in the VM physical address space. These addresses are properties + * of the emulated core/SoC and therefore user space initially knows this + * information. + */ +int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) { int r = 0; struct vgic_dist *vgic = &kvm->arch.vgic; - if (addr & ~KVM_PHYS_MASK) - return -E2BIG; - - if (addr & (SZ_4K - 1)) - return -EINVAL; - mutex_lock(&kvm->lock); switch (type) { case KVM_VGIC_V2_ADDR_TYPE_DIST: - r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, - addr, KVM_VGIC_V2_DIST_SIZE); + if (write) { + r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base, + *addr, KVM_VGIC_V2_DIST_SIZE); + } else { + *addr = vgic->vgic_dist_base; + } break; case KVM_VGIC_V2_ADDR_TYPE_CPU: - r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, - addr, KVM_VGIC_V2_CPU_SIZE); + if (write) { + r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base, + *addr, KVM_VGIC_V2_CPU_SIZE); + } else { + *addr = vgic->vgic_cpu_base; + } break; default: r = -ENODEV; @@ -1538,16 +1559,58 @@ int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr) static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int r; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + r = kvm_vgic_addr(dev->kvm, type, &addr, true); + return (r == -ENODEV) ? -ENXIO : r; + } + } + return -ENXIO; } static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - return -ENXIO; + int r = -ENXIO; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + u64 addr; + unsigned long type = (unsigned long)attr->attr; + + r = kvm_vgic_addr(dev->kvm, type, &addr, false); + if (r) + return (r == -ENODEV) ? -ENXIO : r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + } + } + + return r; } static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + case KVM_VGIC_V2_ADDR_TYPE_CPU: + return 0; + } + break; + } return -ENXIO; } From 0307e1770fdeff2732cf7a35d0f7f49db67c6621 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 23 Sep 2013 14:55:56 -0700 Subject: [PATCH 32/67] irqchip: arm-gic: Define additional MMIO offsets and masks Define CPU interface offsets for the GICC_ABPR, GICC_APR, and GICC_IIDR registers. Define distributor registers for the GICD_SPENDSGIR and the GICD_CPENDSGIR. KVM/ARM needs to know about these definitions to fully support save/restore of the VGIC. Also define some masks and shifts for the various GICH_VMCR fields. Cc: Thomas Gleixner Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/linux/irqchip/arm-gic.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index cac496b1e279..0ceb389dba6c 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -17,6 +17,9 @@ #define GIC_CPU_EOI 0x10 #define GIC_CPU_RUNNINGPRI 0x14 #define GIC_CPU_HIGHPRI 0x18 +#define GIC_CPU_ALIAS_BINPOINT 0x1c +#define GIC_CPU_ACTIVEPRIO 0xd0 +#define GIC_CPU_IDENT 0xfc #define GIC_DIST_CTRL 0x000 #define GIC_DIST_CTR 0x004 @@ -56,6 +59,15 @@ #define GICH_LR_ACTIVE_BIT (1 << 29) #define GICH_LR_EOI (1 << 19) +#define GICH_VMCR_CTRL_SHIFT 0 +#define GICH_VMCR_CTRL_MASK (0x21f << GICH_VMCR_CTRL_SHIFT) +#define GICH_VMCR_PRIMASK_SHIFT 27 +#define GICH_VMCR_PRIMASK_MASK (0x1f << GICH_VMCR_PRIMASK_SHIFT) +#define GICH_VMCR_BINPOINT_SHIFT 21 +#define GICH_VMCR_BINPOINT_MASK (0x7 << GICH_VMCR_BINPOINT_SHIFT) +#define GICH_VMCR_ALIAS_BINPOINT_SHIFT 18 +#define GICH_VMCR_ALIAS_BINPOINT_MASK (0x7 << GICH_VMCR_ALIAS_BINPOINT_SHIFT) + #define GICH_MISR_EOI (1 << 0) #define GICH_MISR_U (1 << 1) From 1006e8cb22e861260688917ca4cfe6cde8ad69eb Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 23 Sep 2013 14:55:56 -0700 Subject: [PATCH 33/67] KVM: arm-vgic: Make vgic mmio functions more generic Rename the vgic_ranges array to vgic_dist_ranges to be more specific and to prepare for handling CPU interface register access as well (for save/restore of VGIC state). Pass offset from distributor or interface MMIO base to find_matching_range function instead of the physical address of the access in the VM memory map. This allows other callers unaware of the VM specifics, but with generic VGIC knowledge to reuse the function. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 45db48de4282..e2596f618281 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -602,7 +602,7 @@ struct mmio_range { phys_addr_t offset); }; -static const struct mmio_range vgic_ranges[] = { +static const struct mmio_range vgic_dist_ranges[] = { { .base = GIC_DIST_CTRL, .len = 12, @@ -669,14 +669,13 @@ static const struct mmio_range vgic_ranges[] = { static const struct mmio_range *find_matching_range(const struct mmio_range *ranges, struct kvm_exit_mmio *mmio, - phys_addr_t base) + phys_addr_t offset) { const struct mmio_range *r = ranges; - phys_addr_t addr = mmio->phys_addr - base; while (r->len) { - if (addr >= r->base && - (addr + mmio->len) <= (r->base + r->len)) + if (offset >= r->base && + (offset + mmio->len) <= (r->base + r->len)) return r; r++; } @@ -713,7 +712,8 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, return true; } - range = find_matching_range(vgic_ranges, mmio, base); + offset = mmio->phys_addr - base; + range = find_matching_range(vgic_dist_ranges, mmio, offset); if (unlikely(!range || !range->handle_mmio)) { pr_warn("Unhandled access %d %08llx %d\n", mmio->is_write, mmio->phys_addr, mmio->len); From e9b152cb957cb194437f37e79f0f3c9d34fe53d6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 11 Dec 2013 20:29:11 -0800 Subject: [PATCH 34/67] arm/arm64: kvm: Set vcpu->cpu to -1 on vcpu_put The arch-generic KVM code expects the cpu field of a vcpu to be -1 if the vcpu is no longer assigned to a cpu. This is used for the optimized make_all_cpus_request path and will be used by the vgic code to check that no vcpus are running. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f290b2250ed5..b92ff6d3e34b 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -342,6 +342,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + /* + * The arch-generic KVM code expects the cpu field of a vcpu to be -1 + * if the vcpu is no longer assigned to a cpu. This is used for the + * optimized make_all_cpus_request path. + */ + vcpu->cpu = -1; + kvm_arm_set_running_vcpu(NULL); } From c07a0191ef2de1f9510f12d1f88e3b0b5cd8d66f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 25 Oct 2013 21:17:31 +0100 Subject: [PATCH 35/67] KVM: arm-vgic: Add vgic reg access from dev attr Add infrastructure to handle distributor and cpu interface register accesses through the KVM_{GET/SET}_DEVICE_ATTR interface by adding the KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS groups and defining the semantics of the attr field to be the MMIO offset as specified in the GICv2 specs. Missing register accesses or other changes in individual register access functions to support save/restore of the VGIC state is added in subsequent patches. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- .../virtual/kvm/devices/arm-vgic.txt | 52 +++++ arch/arm/include/uapi/asm/kvm.h | 6 + virt/kvm/arm/vgic.c | 178 ++++++++++++++++++ 3 files changed, 236 insertions(+) diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index c9febb2a0c3e..7f4e91b1316b 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -19,3 +19,55 @@ Groups: KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) Base address in the guest physical address space of the GIC virtual cpu interface register mappings. + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | cpu id | offset | + + All distributor regs are (rw, 32-bit) + + The offset is relative to the "Distributor base address" as defined in the + GICv2 specs. Getting or setting such a register has the same effect as + reading or writing the register on the actual hardware from the cpu + specified with cpu id field. Note that most distributor fields are not + banked, but return the same value regardless of the cpu id used to access + the register. + Limitations: + - Priorities are not implemented, and registers are RAZ/WI + Errors: + -ENODEV: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running + + KVM_DEV_ARM_VGIC_GRP_CPU_REGS + Attributes: + The attr field of kvm_device_attr encodes two values: + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | cpu id | offset | + + All CPU interface regs are (rw, 32-bit) + + The offset specifies the offset from the "CPU interface base address" as + defined in the GICv2 specs. Getting or setting such a register has the + same effect as reading or writing the register on the actual hardware. + + The Active Priorities Registers APRn are implementation defined, so we set a + fixed format for our implementation that fits with the model of a "GICv2 + implementation without the security extensions" which we present to the + guest. This interface always exposes four register APR[0-3] describing the + maximum possible 128 preemption levels. The semantics of the register + indicate if any interrupts in a given preemption level are in the active + state by setting the corresponding bit. + + Thus, preemption level X has one or more active interrupts if and only if: + + APRn[X mod 32] == 0b1, where n = X / 32 + + Bits for undefined preemption levels are RAZ/WI. + + Limitations: + - Priorities are not implemented, and registers are RAZ/WI + Errors: + -ENODEV: Getting or setting this register is not yet supported + -EBUSY: One or more VCPUs are running diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 76a742769e2b..ef0c8785ba16 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -165,6 +165,12 @@ struct kvm_arch_memory_slot { /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 +#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 +#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 +#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 +#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) +#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 +#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index e2596f618281..88599b585362 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -589,6 +589,20 @@ static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, return false; } +static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + return false; +} + +static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + return false; +} + /* * I would have liked to use the kvm_bus_io_*() API instead, but it * cannot cope with banked registers (only the VM pointer is passed @@ -663,6 +677,16 @@ static const struct mmio_range vgic_dist_ranges[] = { .len = 4, .handle_mmio = handle_mmio_sgi_reg, }, + { + .base = GIC_DIST_SGI_PENDING_CLEAR, + .len = VGIC_NR_SGIS, + .handle_mmio = handle_mmio_sgi_clear, + }, + { + .base = GIC_DIST_SGI_PENDING_SET, + .len = VGIC_NR_SGIS, + .handle_mmio = handle_mmio_sgi_set, + }, {} }; @@ -1557,6 +1581,114 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) return r; } +static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + return true; +} + +static const struct mmio_range vgic_cpu_ranges[] = { + { + .base = GIC_CPU_CTRL, + .len = 12, + .handle_mmio = handle_cpu_mmio_misc, + }, + { + .base = GIC_CPU_ALIAS_BINPOINT, + .len = 4, + .handle_mmio = handle_cpu_mmio_misc, + }, + { + .base = GIC_CPU_ACTIVEPRIO, + .len = 16, + .handle_mmio = handle_cpu_mmio_misc, + }, + { + .base = GIC_CPU_IDENT, + .len = 4, + .handle_mmio = handle_cpu_mmio_misc, + }, +}; + +static int vgic_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u32 *reg, bool is_write) +{ + const struct mmio_range *r = NULL, *ranges; + phys_addr_t offset; + int ret, cpuid, c; + struct kvm_vcpu *vcpu, *tmp_vcpu; + struct vgic_dist *vgic; + struct kvm_exit_mmio mmio; + + offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> + KVM_DEV_ARM_VGIC_CPUID_SHIFT; + + mutex_lock(&dev->kvm->lock); + + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { + ret = -EINVAL; + goto out; + } + + vcpu = kvm_get_vcpu(dev->kvm, cpuid); + vgic = &dev->kvm->arch.vgic; + + mmio.len = 4; + mmio.is_write = is_write; + if (is_write) + mmio_data_write(&mmio, ~0, *reg); + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + mmio.phys_addr = vgic->vgic_dist_base + offset; + ranges = vgic_dist_ranges; + break; + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + mmio.phys_addr = vgic->vgic_cpu_base + offset; + ranges = vgic_cpu_ranges; + break; + default: + BUG(); + } + r = find_matching_range(ranges, &mmio, offset); + + if (unlikely(!r || !r->handle_mmio)) { + ret = -ENXIO; + goto out; + } + + + spin_lock(&vgic->lock); + + /* + * Ensure that no other VCPU is running by checking the vcpu->cpu + * field. If no other VPCUs are running we can safely access the VGIC + * state, because even if another VPU is run after this point, that + * VCPU will not touch the vgic state, because it will block on + * getting the vgic->lock in kvm_vgic_sync_hwstate(). + */ + kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) { + if (unlikely(tmp_vcpu->cpu != -1)) { + ret = -EBUSY; + goto out_vgic_unlock; + } + } + + offset -= r->base; + r->handle_mmio(vcpu, &mmio, offset); + + if (!is_write) + *reg = mmio_data_read(&mmio, ~0); + + ret = 0; +out_vgic_unlock: + spin_unlock(&vgic->lock); +out: + mutex_unlock(&dev->kvm->lock); + return ret; +} + static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r; @@ -1573,6 +1705,18 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = kvm_vgic_addr(dev->kvm, type, &addr, true); return (r == -ENODEV) ? -ENXIO : r; } + + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg; + + if (get_user(reg, uaddr)) + return -EFAULT; + + return vgic_attr_regs_access(dev, attr, ®, true); + } + } return -ENXIO; @@ -1594,14 +1738,42 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) if (copy_to_user(uaddr, &addr, sizeof(addr))) return -EFAULT; + break; } + + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 reg = 0; + + r = vgic_attr_regs_access(dev, attr, ®, false); + if (r) + return r; + r = put_user(reg, uaddr); + break; + } + } return r; } +static int vgic_has_attr_regs(const struct mmio_range *ranges, + phys_addr_t offset) +{ + struct kvm_exit_mmio dev_attr_mmio; + + dev_attr_mmio.len = 4; + if (find_matching_range(ranges, &dev_attr_mmio, offset)) + return 0; + else + return -ENXIO; +} + static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + phys_addr_t offset; + switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: switch (attr->attr) { @@ -1610,6 +1782,12 @@ static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return 0; } break; + case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: + offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + return vgic_has_attr_regs(vgic_dist_ranges, offset); + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; + return vgic_has_attr_regs(vgic_cpu_ranges, offset); } return -ENXIO; } From cbd333a4bfd0d93bba36d46a0e4e7979228873a6 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 15 Nov 2013 20:51:31 -0800 Subject: [PATCH 36/67] KVM: arm-vgic: Support unqueueing of LRs to the dist To properly access the VGIC state from user space it is very unpractical to have to loop through all the LRs in all register access functions. Instead, support moving all pending state from LRs to the distributor, but leave active state LRs alone. Note that to accurately present the active and pending state to VCPUs reading these distributor registers from a live VM, we would have to stop all other VPUs than the calling VCPU and ask each CPU to unqueue their LR state onto the distributor and add fields to track active state on the distributor side as well. We don't have any users of such functionality yet and there are other inaccuracies of the GIC emulation, so don't provide accurate synchronized access to this state just yet. However, when the time comes, having this function should help. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 88 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 88599b585362..d08ba28e729a 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -589,6 +589,80 @@ static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, return false; } +#define LR_CPUID(lr) \ + (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT) +#define LR_IRQID(lr) \ + ((lr) & GICH_LR_VIRTUALID) + +static void vgic_retire_lr(int lr_nr, int irq, struct vgic_cpu *vgic_cpu) +{ + clear_bit(lr_nr, vgic_cpu->lr_used); + vgic_cpu->vgic_lr[lr_nr] &= ~GICH_LR_STATE; + vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; +} + +/** + * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor + * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs + * + * Move any pending IRQs that have already been assigned to LRs back to the + * emulated distributor state so that the complete emulated state can be read + * from the main emulation structures without investigating the LRs. + * + * Note that IRQs in the active state in the LRs get their pending state moved + * to the distributor but the active state stays in the LRs, because we don't + * track the active state on the distributor side. + */ +static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int vcpu_id = vcpu->vcpu_id; + int i, irq, source_cpu; + u32 *lr; + + for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { + lr = &vgic_cpu->vgic_lr[i]; + irq = LR_IRQID(*lr); + source_cpu = LR_CPUID(*lr); + + /* + * There are three options for the state bits: + * + * 01: pending + * 10: active + * 11: pending and active + * + * If the LR holds only an active interrupt (not pending) then + * just leave it alone. + */ + if ((*lr & GICH_LR_STATE) == GICH_LR_ACTIVE_BIT) + continue; + + /* + * Reestablish the pending state on the distributor and the + * CPU interface. It may have already been pending, but that + * is fine, then we are only setting a few bits that were + * already set. + */ + vgic_dist_irq_set(vcpu, irq); + if (irq < VGIC_NR_SGIS) + dist->irq_sgi_sources[vcpu_id][irq] |= 1 << source_cpu; + *lr &= ~GICH_LR_PENDING_BIT; + + /* + * If there's no state left on the LR (it could still be + * active), then the LR does not hold any useful info and can + * be marked as free for other use. + */ + if (!(*lr & GICH_LR_STATE)) + vgic_retire_lr(i, irq, vgic_cpu); + + /* Finally update the VGIC state. */ + vgic_update_state(vcpu->kvm); + } +} + static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) @@ -848,8 +922,6 @@ static void vgic_update_state(struct kvm *kvm) } } -#define LR_CPUID(lr) \ - (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT) #define MK_LR_PEND(src, irq) \ (GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq)) @@ -871,9 +943,7 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; if (!vgic_irq_is_enabled(vcpu, irq)) { - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; - clear_bit(lr, vgic_cpu->lr_used); - vgic_cpu->vgic_lr[lr] &= ~GICH_LR_STATE; + vgic_retire_lr(lr, irq, vgic_cpu); if (vgic_irq_is_active(vcpu, irq)) vgic_irq_clear_active(vcpu, irq); } @@ -1675,6 +1745,14 @@ static int vgic_attr_regs_access(struct kvm_device *dev, } } + /* + * Move all pending IRQs from the LRs on all VCPUs so the pending + * state can be properly represented in the register state accessible + * through this API. + */ + kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) + vgic_unqueue_irqs(tmp_vcpu); + offset -= r->base; r->handle_mmio(vcpu, &mmio, offset); From 90a5355ee7639e92c0492ec592cba5c31bd80687 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 25 Oct 2013 21:22:31 +0100 Subject: [PATCH 37/67] KVM: arm-vgic: Add GICD_SPENDSGIR and GICD_CPENDSGIR handlers Handle MMIO accesses to the two registers which should support both the case where the VMs want to read/write either of these registers and the case where user space reads/writes these registers to do save/restore of the VGIC state. Note that the added complexity compared to simple set/clear enable registers stems from the bookkeping of source cpu ids. It may be possible to change the underlying data structure to simplify the complexity, but since this is not in the critical path at all, this will do. Also note that reading this register from a live guest will not be accurate compared to on hardware, because some state may be living on the CPU LRs and the only way to give a consistent read would be to force stop all the VCPUs and request them to unqueu the LR state onto the distributor. Until we have an actual user of live reading this register, we can live with the difference. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 70 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index d08ba28e729a..e59aaa4c64e5 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -663,18 +663,80 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) } } -static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) +/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */ +static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) { + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int sgi; + int min_sgi = (offset & ~0x3) * 4; + int max_sgi = min_sgi + 3; + int vcpu_id = vcpu->vcpu_id; + u32 reg = 0; + + /* Copy source SGIs from distributor side */ + for (sgi = min_sgi; sgi <= max_sgi; sgi++) { + int shift = 8 * (sgi - min_sgi); + reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift; + } + + mmio_data_write(mmio, ~0, reg); return false; } +static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset, bool set) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + int sgi; + int min_sgi = (offset & ~0x3) * 4; + int max_sgi = min_sgi + 3; + int vcpu_id = vcpu->vcpu_id; + u32 reg; + bool updated = false; + + reg = mmio_data_read(mmio, ~0); + + /* Clear pending SGIs on the distributor */ + for (sgi = min_sgi; sgi <= max_sgi; sgi++) { + u8 mask = reg >> (8 * (sgi - min_sgi)); + if (set) { + if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask) + updated = true; + dist->irq_sgi_sources[vcpu_id][sgi] |= mask; + } else { + if (dist->irq_sgi_sources[vcpu_id][sgi] & mask) + updated = true; + dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask; + } + } + + if (updated) + vgic_update_state(vcpu->kvm); + + return updated; +} + static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - return false; + if (!mmio->is_write) + return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); + else + return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true); +} + +static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + if (!mmio->is_write) + return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); + else + return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false); } /* From fa20f5aea56f271f83e91b9cde00f043a5a14990 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 23 Sep 2013 14:55:57 -0700 Subject: [PATCH 38/67] KVM: arm-vgic: Support CPU interface reg access Implement support for the CPU interface register access driven by MMIO address offsets from the CPU interface base address. Useful for user space to support save/restore of the VGIC state. This commit adds support only for the same logic as the current VGIC support, and no more. For example, the active priority registers are handled as RAZ/WI, just like setting priorities on the emulated distributor. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/vgic.c | 81 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 8 deletions(-) diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index e59aaa4c64e5..be456ce264d0 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -71,6 +71,10 @@ #define VGIC_ADDR_UNDEF (-1) #define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) +#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ +#define IMPLEMENTER_ARM 0x43b +#define GICC_ARCH_VERSION_V2 0x2 + /* Physical address of vgic virtual cpu interface */ static phys_addr_t vgic_vcpu_base; @@ -312,7 +316,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu, u32 word_offset = offset & 3; switch (offset & ~3) { - case 0: /* CTLR */ + case 0: /* GICD_CTLR */ reg = vcpu->kvm->arch.vgic.enabled; vgic_reg_access(mmio, ®, word_offset, ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); @@ -323,15 +327,15 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu, } break; - case 4: /* TYPER */ + case 4: /* GICD_TYPER */ reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; reg |= (VGIC_NR_IRQS >> 5) - 1; vgic_reg_access(mmio, ®, word_offset, ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); break; - case 8: /* IIDR */ - reg = 0x4B00043B; + case 8: /* GICD_IIDR */ + reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); vgic_reg_access(mmio, ®, word_offset, ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); break; @@ -1716,9 +1720,70 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - return true; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u32 reg, mask = 0, shift = 0; + bool updated = false; + + switch (offset & ~0x3) { + case GIC_CPU_CTRL: + mask = GICH_VMCR_CTRL_MASK; + shift = GICH_VMCR_CTRL_SHIFT; + break; + case GIC_CPU_PRIMASK: + mask = GICH_VMCR_PRIMASK_MASK; + shift = GICH_VMCR_PRIMASK_SHIFT; + break; + case GIC_CPU_BINPOINT: + mask = GICH_VMCR_BINPOINT_MASK; + shift = GICH_VMCR_BINPOINT_SHIFT; + break; + case GIC_CPU_ALIAS_BINPOINT: + mask = GICH_VMCR_ALIAS_BINPOINT_MASK; + shift = GICH_VMCR_ALIAS_BINPOINT_SHIFT; + break; + } + + if (!mmio->is_write) { + reg = (vgic_cpu->vgic_vmcr & mask) >> shift; + mmio_data_write(mmio, ~0, reg); + } else { + reg = mmio_data_read(mmio, ~0); + reg = (reg << shift) & mask; + if (reg != (vgic_cpu->vgic_vmcr & mask)) + updated = true; + vgic_cpu->vgic_vmcr &= ~mask; + vgic_cpu->vgic_vmcr |= reg; + } + return updated; } +static bool handle_mmio_abpr(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, phys_addr_t offset) +{ + return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT); +} + +static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu, + struct kvm_exit_mmio *mmio, + phys_addr_t offset) +{ + u32 reg; + + if (mmio->is_write) + return false; + + /* GICC_IIDR */ + reg = (PRODUCT_ID_KVM << 20) | + (GICC_ARCH_VERSION_V2 << 16) | + (IMPLEMENTER_ARM << 0); + mmio_data_write(mmio, ~0, reg); + return false; +} + +/* + * CPU Interface Register accesses - these are not accessed by the VM, but by + * user space for saving and restoring VGIC state. + */ static const struct mmio_range vgic_cpu_ranges[] = { { .base = GIC_CPU_CTRL, @@ -1728,17 +1793,17 @@ static const struct mmio_range vgic_cpu_ranges[] = { { .base = GIC_CPU_ALIAS_BINPOINT, .len = 4, - .handle_mmio = handle_cpu_mmio_misc, + .handle_mmio = handle_mmio_abpr, }, { .base = GIC_CPU_ACTIVEPRIO, .len = 16, - .handle_mmio = handle_cpu_mmio_misc, + .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_CPU_IDENT, .len = 4, - .handle_mmio = handle_cpu_mmio_misc, + .handle_mmio = handle_cpu_mmio_ident, }, }; From da7814700a0c408bead58ce4714b7625ffbaade1 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 12 Dec 2013 16:12:22 +0000 Subject: [PATCH 39/67] arm64: KVM: Add Kconfig option for max VCPUs per-Guest Current max VCPUs per-Guest is set to 4 which is preventing us from creating a Guest (or VM) with 8 VCPUs on Host (e.g. X-Gene Storm SOC) with 8 Host CPUs. The correct value of max VCPUs per-Guest should be same as the max CPUs supported by GICv2 which is 8 but, increasing value of max VCPUs per-Guest can make things slower hence we add Kconfig option to let KVM users select appropriate max VCPUs per-Guest. Signed-off-by: Anup Patel Signed-off-by: Pranavkumar Sawargaonkar Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 7 ++++++- arch/arm64/kvm/Kconfig | 11 +++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5d85a02d1231..0a1d69751562 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -26,7 +26,12 @@ #include #include -#define KVM_MAX_VCPUS 4 +#if defined(CONFIG_KVM_ARM_MAX_VCPUS) +#define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS +#else +#define KVM_MAX_VCPUS 0 +#endif + #define KVM_USER_MEM_SLOTS 32 #define KVM_PRIVATE_MEM_SLOTS 4 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 4480ab339a00..8ba85e9ea388 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -36,6 +36,17 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +config KVM_ARM_MAX_VCPUS + int "Number maximum supported virtual CPUs per VM" + depends on KVM_ARM_HOST + default 4 + help + Static number of max supported virtual CPUs per VM. + + If you choose a high number, the vcpu structures will be quite + large, so only choose a reasonable number that you expect to + actually use. + config KVM_ARM_VGIC bool depends on KVM_ARM_HOST && OF From e28100bd8ed9e37b7cd4578140a1e7f95bd40835 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 14 Nov 2013 15:20:08 +0000 Subject: [PATCH 40/67] arm64: KVM: Support X-Gene guest VCPU on APM X-Gene host This patch allows us to have X-Gene guest VCPU when using KVM arm64 on APM X-Gene host. We add KVM_ARM_TARGET_XGENE_POTENZA for X-Gene Potenza compatible guest VCPU and we return KVM_ARM_TARGET_XGENE_POTENZA in kvm_target_cpu() when running on X-Gene host with Potenza core. [maz: sanitized the commit log] Signed-off-by: Anup Patel Signed-off-by: Pranavkumar Sawargaonkar Signed-off-by: Marc Zyngier --- arch/arm64/include/uapi/asm/kvm.h | 3 ++- arch/arm64/kvm/guest.c | 32 +++++++++++++++++----------- arch/arm64/kvm/sys_regs_generic_v8.c | 3 +++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 5031f4263937..d9f026bc1a27 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -55,8 +55,9 @@ struct kvm_regs { #define KVM_ARM_TARGET_AEM_V8 0 #define KVM_ARM_TARGET_FOUNDATION_V8 1 #define KVM_ARM_TARGET_CORTEX_A57 2 +#define KVM_ARM_TARGET_XGENE_POTENZA 3 -#define KVM_ARM_NUM_TARGETS 3 +#define KVM_ARM_NUM_TARGETS 4 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3f0731e53274..08745578d54d 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -207,20 +207,26 @@ int __attribute_const__ kvm_target_cpu(void) unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); - if (implementor != ARM_CPU_IMP_ARM) - return -EINVAL; + switch (implementor) { + case ARM_CPU_IMP_ARM: + switch (part_number) { + case ARM_CPU_PART_AEM_V8: + return KVM_ARM_TARGET_AEM_V8; + case ARM_CPU_PART_FOUNDATION: + return KVM_ARM_TARGET_FOUNDATION_V8; + case ARM_CPU_PART_CORTEX_A57: + return KVM_ARM_TARGET_CORTEX_A57; + }; + break; + case ARM_CPU_IMP_APM: + switch (part_number) { + case APM_CPU_PART_POTENZA: + return KVM_ARM_TARGET_XGENE_POTENZA; + }; + break; + }; - switch (part_number) { - case ARM_CPU_PART_AEM_V8: - return KVM_ARM_TARGET_AEM_V8; - case ARM_CPU_PART_FOUNDATION: - return KVM_ARM_TARGET_FOUNDATION_V8; - case ARM_CPU_PART_CORTEX_A57: - /* Currently handled by the generic backend */ - return KVM_ARM_TARGET_CORTEX_A57; - default: - return -EINVAL; - } + return -EINVAL; } int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c index 4268ab9356b1..8fe6f76b0edc 100644 --- a/arch/arm64/kvm/sys_regs_generic_v8.c +++ b/arch/arm64/kvm/sys_regs_generic_v8.c @@ -90,6 +90,9 @@ static int __init sys_reg_genericv8_init(void) &genericv8_target_table); kvm_register_target_sys_reg_table(KVM_ARM_TARGET_CORTEX_A57, &genericv8_target_table); + kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA, + &genericv8_target_table); + return 0; } late_initcall(sys_reg_genericv8_init); From e5cf9dcdbfd26cd4e1991db08755da900454efeb Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 12 Dec 2013 16:12:23 +0000 Subject: [PATCH 41/67] arm64: KVM: Force undefined exception for Guest SMC intructions The SMC-based PSCI emulation for Guest is going to be very different from the in-kernel HVC-based PSCI emulation hence for now just inject undefined exception when Guest executes SMC instruction. Signed-off-by: Anup Patel Signed-off-by: Pranavkumar Sawargaonkar Signed-off-by: marc Zyngier --- arch/arm64/kvm/handle_exit.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 8da56067c304..df84d7bcc7df 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -39,9 +39,6 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - if (kvm_psci_call(vcpu)) - return 1; - kvm_inject_undefined(vcpu); return 1; } From 171800328f6e2443e0e356de5b41fb7e0fff4448 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 22 Dec 2013 01:21:23 +0900 Subject: [PATCH 42/67] KVM: doc: Fix typo in doc/virtual/kvm Correct spelling typo in Documentations/virtual/kvm Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 4 ++-- Documentation/virtual/kvm/hypercalls.txt | 2 +- Documentation/virtual/kvm/locking.txt | 4 ++-- Documentation/virtual/kvm/ppc-pv.txt | 2 +- Documentation/virtual/kvm/timekeeping.txt | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 867112f1968d..a8cdc7b72281 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2104,7 +2104,7 @@ Returns: 0 on success, -1 on error Allows setting an eventfd to directly trigger a guest interrupt. kvm_irqfd.fd specifies the file descriptor to use as the eventfd and kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When -an event is tiggered on the eventfd, an interrupt is injected into +an event is triggered on the eventfd, an interrupt is injected into the guest using the specified gsi pin. The irqfd is removed using the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd and kvm_irqfd.gsi. @@ -2115,7 +2115,7 @@ interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an additional eventfd in the kvm_irqfd.resamplefd field. When operating in resample mode, posting of an interrupt through kvm_irq.fd asserts the specified gsi in the irqchip. When the irqchip is resampled, such -as from an EOI, the gsi is de-asserted and the user is notifed via +as from an EOI, the gsi is de-asserted and the user is notified via kvm_irqfd.resamplefd. It is the user's responsibility to re-queue the interrupt if the device making use of it still requires service. Note that closing the resamplefd is not sufficient to disable the diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index d922d73efa7b..c8d040e27046 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -77,7 +77,7 @@ Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest kernel mode for an event to occur (ex: a spinlock to become available) can execute HLT instruction once it has busy-waited for more than a threshold time-interval. Execution of HLT instruction would cause the hypervisor to put -the vcpu to sleep until occurence of an appropriate event. Another vcpu of the +the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index f8869410d40c..d68af4dc3006 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -112,7 +112,7 @@ The Dirty bit is lost in this case. In order to avoid this kind of issue, we always treat the spte as "volatile" if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, -the spte is always atomicly updated in this case. +the spte is always atomically updated in this case. 3): flush tlbs due to spte updated If the spte is updated from writable to readonly, we should flush all TLBs, @@ -125,7 +125,7 @@ be flushed caused by this reason in mmu_spte_update() since this is a common function to update spte (present -> present). Since the spte is "volatile" if it can be updated out of mmu-lock, we always -atomicly update the spte, the race caused by fast page fault can be avoided, +atomically update the spte, the race caused by fast page fault can be avoided, See the comments in spte_has_volatile_bits() and mmu_spte_update(). 3. Reference diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt index 4cd076febb02..4643cde517c4 100644 --- a/Documentation/virtual/kvm/ppc-pv.txt +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -115,7 +115,7 @@ If any other bit changes in the MSR, please still use mtmsr(d). Patched instructions ==================== -The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions +The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions respectively on 32 bit systems with an added offset of 4 to accommodate for big endianness. diff --git a/Documentation/virtual/kvm/timekeeping.txt b/Documentation/virtual/kvm/timekeeping.txt index df8946377cb6..76808a17ad84 100644 --- a/Documentation/virtual/kvm/timekeeping.txt +++ b/Documentation/virtual/kvm/timekeeping.txt @@ -467,7 +467,7 @@ at any time. This causes problems as the passage of real time, the injection of machine interrupts and the associated clock sources are no longer completely synchronized with real time. -This same problem can occur on native harware to a degree, as SMM mode may +This same problem can occur on native hardware to a degree, as SMM mode may steal cycles from the naturally on X86 systems when SMM mode is used by the BIOS, but not in such an extreme fashion. However, the fact that SMM mode may cause similar problems to virtualization makes it a good justification for From 2f0a6397dd3cac2fb05b46cad08c1d532c04d6b8 Mon Sep 17 00:00:00 2001 From: Zhihui Zhang Date: Mon, 30 Dec 2013 15:56:29 -0500 Subject: [PATCH 43/67] KVM: VMX: check use I/O bitmap first before unconditional I/O exit According to Table C-1 of Intel SDM 3C, a VM exit happens on an I/O instruction when "use I/O bitmaps" VM-execution control was 0 _and_ the "unconditional I/O exiting" VM-execution control was 1. So we can't just check "unconditional I/O exiting" alone. This patch was improved by suggestion from Jan Kiszka. Reviewed-by: Jan Kiszka Signed-off-by: Zhihui Zhang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9cc54842ae14..0abf8b783f19 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6521,11 +6521,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, int size; u8 b; - if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING)) - return 1; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - return 0; + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); From 7940876e1330671708186ac3386aa521ffb5c182 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 Dec 2013 12:12:29 -0800 Subject: [PATCH 44/67] kvm: make local functions static Running 'make namespacecheck' found lots of functions that should be declared static, since only used in one file. Signed-off-by: Stephen Hemminger Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 16 ---------------- virt/kvm/ioapic.c | 2 +- virt/kvm/ioapic.h | 1 - virt/kvm/kvm_main.c | 35 ++++++++++++++++++----------------- 4 files changed, 19 insertions(+), 35 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1f46f66f60ab..4306c5608f6d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -463,8 +463,6 @@ void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); -void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new, - u64 last_generation); static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) { @@ -537,7 +535,6 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); -void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); @@ -549,7 +546,6 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_release_pfn_dirty(pfn_t pfn); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); void kvm_set_pfn_accessed(pfn_t pfn); @@ -576,8 +572,6 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); -void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); @@ -604,8 +598,6 @@ int kvm_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status); long kvm_arch_vm_ioctl(struct file *filp, @@ -653,8 +645,6 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -void kvm_free_physmem(struct kvm *kvm); - void *kvm_kvzalloc(unsigned long size); void kvm_kvfree(const void *addr); @@ -1097,12 +1087,6 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } - -static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) -{ - return true; -} - #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 2d682977ce82..ce9ed99ad7dc 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -520,7 +520,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, return 0; } -void kvm_ioapic_reset(struct kvm_ioapic *ioapic) +static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) { int i; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 615d8c995c3c..90d43e95dcf8 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -91,7 +91,6 @@ void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); -void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3efba97bdce2..e7c6ddd8ecc0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -95,6 +95,12 @@ static int hardware_enable_all(void); static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); +static void update_memslots(struct kvm_memslots *slots, + struct kvm_memory_slot *new, u64 last_generation); + +static void kvm_release_pfn_dirty(pfn_t pfn); +static void mark_page_dirty_in_slot(struct kvm *kvm, + struct kvm_memory_slot *memslot, gfn_t gfn); bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); @@ -553,7 +559,7 @@ static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free, free->npages = 0; } -void kvm_free_physmem(struct kvm *kvm) +static void kvm_free_physmem(struct kvm *kvm) { struct kvm_memslots *slots = kvm->memslots; struct kvm_memory_slot *memslot; @@ -675,8 +681,9 @@ static void sort_memslots(struct kvm_memslots *slots) slots->id_to_index[slots->memslots[i].id] = i; } -void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new, - u64 last_generation) +static void update_memslots(struct kvm_memslots *slots, + struct kvm_memory_slot *new, + u64 last_generation) { if (new) { int id = new->id; @@ -924,8 +931,8 @@ int kvm_set_memory_region(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_set_memory_region); -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) { if (mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; @@ -1047,7 +1054,7 @@ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, } unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, - gfn_t gfn) + gfn_t gfn) { return gfn_to_hva_many(slot, gfn, NULL); } @@ -1387,18 +1394,11 @@ void kvm_release_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); -void kvm_release_pfn_dirty(pfn_t pfn) +static void kvm_release_pfn_dirty(pfn_t pfn) { kvm_set_pfn_dirty(pfn); kvm_release_pfn_clean(pfn); } -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -void kvm_set_page_dirty(struct page *page) -{ - kvm_set_pfn_dirty(page_to_pfn(page)); -} -EXPORT_SYMBOL_GPL(kvm_set_page_dirty); void kvm_set_pfn_dirty(pfn_t pfn) { @@ -1640,8 +1640,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, - gfn_t gfn) +static void mark_page_dirty_in_slot(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t gfn) { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -1757,7 +1758,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * locking does not harm. It may result in trying to yield to same VCPU, fail * and continue with next VCPU and so on. */ -bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) +static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) { bool eligible; From ea0269bc34a7df6bda1ee862ad198dee0839f170 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 Dec 2013 12:13:08 -0800 Subject: [PATCH 45/67] kvm: remove dead code The function kvm_io_bus_read_cookie is defined but never used in current in-tree code. Signed-off-by: Stephen Hemminger Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 27 --------------------------- 2 files changed, 29 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4306c5608f6d..b8e9a43e501a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -172,8 +172,6 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie); int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); -int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, - int len, void *val, long cookie); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7c6ddd8ecc0..b28579e84248 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2934,33 +2934,6 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, return r < 0 ? r : 0; } -/* kvm_io_bus_read_cookie - called under kvm->slots_lock */ -int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, - int len, void *val, long cookie) -{ - struct kvm_io_bus *bus; - struct kvm_io_range range; - - range = (struct kvm_io_range) { - .addr = addr, - .len = len, - }; - - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - - /* First try the device referenced by cookie. */ - if ((cookie >= 0) && (cookie < bus->dev_count) && - (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) - if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len, - val)) - return cookie; - - /* - * cookie contained garbage; fall back to search and return the - * correct cookie value. - */ - return __kvm_io_bus_read(bus, &range, val); -} /* Caller must hold slots_lock. */ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, From 96893977b8f732493815e7a2b552c37e1bb967e5 Mon Sep 17 00:00:00 2001 From: Chen Fan Date: Thu, 2 Jan 2014 17:14:11 +0800 Subject: [PATCH 46/67] KVM: x86: Fix debug typo error in lapic fix the 'vcpi' typos when apic_debug is enabled. Signed-off-by: Chen Fan Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5439117d5c4c..206715bc603d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -432,7 +432,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) u8 val; if (pv_eoi_get_user(vcpu, &val) < 0) apic_debug("Can't read EOI MSR value: 0x%llx\n", - (unsigned long long)vcpi->arch.pv_eoi.msr_val); + (unsigned long long)vcpu->arch.pv_eoi.msr_val); return val & 0x1; } @@ -440,7 +440,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { apic_debug("Can't set EOI MSR value: 0x%llx\n", - (unsigned long long)vcpi->arch.pv_eoi.msr_val); + (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); @@ -450,7 +450,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { apic_debug("Can't clear EOI MSR value: 0x%llx\n", - (unsigned long long)vcpi->arch.pv_eoi.msr_val); + (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); From 26a865f4aa8e66a6d94958de7656f7f1b03c6c56 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Jan 2014 17:00:51 -0200 Subject: [PATCH 47/67] KVM: VMX: fix use after free of vmx->loaded_vmcs After free_loaded_vmcs executes, the "loaded_vmcs" structure is kfreed, and now vmx->loaded_vmcs points to a kfreed area. Subsequent free_loaded_vmcs then attempts to manipulate vmx->loaded_vmcs. Switch the order to avoid the problem. https://bugzilla.redhat.com/show_bug.cgi?id=1047892 Reviewed-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0abf8b783f19..7661eb171936 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7390,8 +7390,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_nested(vmx); free_loaded_vmcs(vmx->loaded_vmcs); + free_nested(vmx); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); From 136d737fd20102f1be9b02356590fd55e3a40d0e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Dec 2013 16:56:06 +0000 Subject: [PATCH 48/67] arm/arm64: KVM: relax the requirements of VMA alignment for THP The THP code in KVM/ARM is a bit restrictive in not allowing a THP to be used if the VMA is not 2MB aligned. Actually, it is not so much the VMA that matters, but the associated memslot: A process can perfectly mmap a region with no particular alignment restriction, and then pass a 2MB aligned address to KVM. In this case, KVM will only use this 2MB aligned region, and will ignore the range between vma->vm_start and memslot->userspace_addr. It can also choose to place this memslot at whatever alignment it wants in the IPA space. In the end, what matters is the relative alignment of the user space and IPA mappings with respect to a 2M page. They absolutely must be the same if you want to use THP. Cc: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 659db0ed1370..7789857d1470 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -667,14 +667,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { /* - * Pages belonging to VMAs not aligned to the PMD mapping - * granularity cannot be mapped using block descriptors even - * if the pages belong to a THP for the process, because the - * stage-2 block descriptor will cover more than a single THP - * and we loose atomicity for unmapping, updates, and splits - * of the THP or other pages in the stage-2 block range. + * Pages belonging to memslots that don't have the same + * alignment for userspace and IPA cannot be mapped using + * block descriptors even if the pages belong to a THP for + * the process, because the stage-2 block descriptor will + * cover more than a single THP and we loose atomicity for + * unmapping, updates, and splits of the THP or other pages + * in the stage-2 block range. */ - if (vma->vm_start & ~PMD_MASK) + if ((memslot->userspace_addr & ~PMD_MASK) != + ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) force_pte = true; } up_read(¤t->mm->mmap_sem); From 61466710de078c697106fa5b70ec7afc9feab520 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Tue, 7 Jan 2014 13:45:15 +0530 Subject: [PATCH 49/67] KVM: ARM: Remove duplicate include trace.h was included twice. Remove duplicate inclusion. Signed-off-by: Sachin Kamat Signed-off-by: Christoffer Dall --- arch/arm/kvm/handle_exit.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index a92079011a83..0de91fc6de0f 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -26,8 +26,6 @@ #include "trace.h" -#include "trace.h" - typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) From e81d1ad32753cdeaef56b9bffe3b8ab7b5c776e5 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Fri, 10 Jan 2014 01:28:46 +0100 Subject: [PATCH 50/67] kvm: vfio: silence GCC warning Building vfio.o triggers a GCC warning (when building for 32 bits x86): arch/x86/kvm/../../../virt/kvm/vfio.c: In function 'kvm_vfio_set_group': arch/x86/kvm/../../../virt/kvm/vfio.c:104:22: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] void __user *argp = (void __user *)arg; ^ Silence this warning by casting arg to unsigned long. argp's current type, "void __user *", is always casted to "int32_t __user *". So its type might as well be changed to "int32_t __user *". Signed-off-by: Paul Bolle Signed-off-by: Paolo Bonzini --- virt/kvm/vfio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ca4260e35037..b4f9507ae650 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -101,14 +101,14 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) struct kvm_vfio *kv = dev->private; struct vfio_group *vfio_group; struct kvm_vfio_group *kvg; - void __user *argp = (void __user *)arg; + int32_t __user *argp = (int32_t __user *)(unsigned long)arg; struct fd f; int32_t fd; int ret; switch (attr) { case KVM_DEV_VFIO_GROUP_ADD: - if (get_user(fd, (int32_t __user *)argp)) + if (get_user(fd, argp)) return -EFAULT; f = fdget(fd); @@ -148,7 +148,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) return 0; case KVM_DEV_VFIO_GROUP_DEL: - if (get_user(fd, (int32_t __user *)argp)) + if (get_user(fd, argp)) return -EFAULT; f = fdget(fd); From 4a55dd7273c95b4a19fbcf0ae1bbd1cfd09dfc36 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 9 Jan 2014 18:43:16 -0600 Subject: [PATCH 51/67] kvm: Provide kvm_vcpu_eligible_for_directed_yield() stub Commit 7940876e1330671708186ac3386aa521ffb5c182 ("kvm: make local functions static") broke KVM PPC builds due to removing (rather than moving) the stub version of kvm_vcpu_eligible_for_directed_yield(). This patch reintroduces it. Signed-off-by: Scott Wood Cc: Stephen Hemminger Cc: Alexander Graf [Move the #ifdef inside the function. - Paolo] Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b28579e84248..9ed9c8c7b874 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1735,7 +1735,6 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) } EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); -#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT /* * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: @@ -1760,6 +1759,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); */ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || @@ -1770,8 +1770,10 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); return eligible; -} +#else + return true; #endif +} void kvm_vcpu_on_spin(struct kvm_vcpu *me) { From 37f6a4e237303549c8676dfe1fd1991ceab512eb Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Jan 2014 17:09:32 -0200 Subject: [PATCH 52/67] KVM: x86: handle invalid root_hpa everywhere Rom Freiman notes other code paths vulnerable to bug fixed by 989c6b34f6a9480e397b. Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 9 +++++++++ arch/x86/kvm/paging_tmpl.h | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 31a570287fcc..e50425d0f5f7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2832,6 +2832,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, bool ret = false; u64 spte = 0ull; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return false; + if (!page_fault_can_be_fast(error_code)) return false; @@ -3227,6 +3230,9 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) struct kvm_shadow_walk_iterator iterator; u64 spte = 0ull; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return spte; + walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) if (!is_shadow_present_pte(spte)) @@ -4513,6 +4519,9 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) u64 spte; int nr_sptes = 0; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return nr_sptes; + walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { sptes[iterator.level-1] = spte; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ad75d77999d0..cba218a2f08d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -569,6 +569,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + goto out_gpte_changed; + for (shadow_walk_init(&it, vcpu, addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { @@ -820,6 +823,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + WARN_ON(1); + return; + } + spin_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, gva, iterator) { level = iterator.level; From 9ed96e87c5748de4c2807ef17e81287c7304186c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 6 Jan 2014 12:00:02 -0200 Subject: [PATCH 53/67] KVM: x86: limit PIT timer frequency Limit PIT timer frequency similarly to the limit applied by LAPIC timer. Cc: stable@kernel.org Reviewed-by: Jan Kiszka Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 18 ++++++++++++++++++ arch/x86/kvm/lapic.c | 3 --- arch/x86/kvm/x86.c | 3 +++ arch/x86/kvm/x86.h | 2 ++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 412a5aa0ef94..518d86471b76 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -37,6 +37,7 @@ #include "irq.h" #include "i8254.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -349,6 +350,23 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) atomic_set(&ps->pending, 0); ps->irq_ack = 1; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (ps->is_periodic) { + s64 min_period = min_timer_period_us * 1000LL; + + if (ps->period < min_period) { + pr_info_ratelimited( + "kvm: requested %lld ns " + "i8254 timer period limited to %lld ns\n", + ps->period, min_period); + ps->period = min_period; + } + } + hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 206715bc603d..1ac009390780 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -71,9 +71,6 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) -static unsigned int min_timer_period_us = 500; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { *((u32 *) (apic->regs + reg_off)) = val; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1dc0359e2095..0fd2bd78fccf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -94,6 +94,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 587fb9ede436..8da5823bcde6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -125,5 +125,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) extern u64 host_xcr0; +extern unsigned int min_timer_period_us; + extern struct static_key kvm_no_apic_vcpu; #endif From f25e656d31ad112612839edaded18920cafea3b1 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 6 Jan 2014 12:18:59 -0200 Subject: [PATCH 54/67] KVM: x86: fix tsc catchup issue with tsc scaling To fix a problem related to different resolution of TSC and system clock, the offset in TSC units is approximated by delta = vcpu->hv_clock.tsc_timestamp - vcpu->last_guest_tsc (Guest TSC value at (Guest TSC value at last VM-exit) the last kvm_guest_time_update call) Delta is then later scaled using mult,shift pair found in hv_clock structure (which is correct against tsc_timestamp in that structure). However, if a frequency change is performed between these two points, this delta is measured using different TSC frequencies, but scaled using mult,shift pair for one frequency only. The end result is an incorrect delta. The bug which this code works around is not the only cause for clock backwards events. The global accumulator is still necessary, so remove the max_kernel_ns fix and rely on the global accumulator for no clock backwards events. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 41 +---------------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0fd2bd78fccf..842abd33e9b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1487,7 +1487,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; - s64 kernel_ns, max_kernel_ns; + s64 kernel_ns; u64 tsc_timestamp, host_tsc; struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; @@ -1546,37 +1546,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (!vcpu->pv_time_enabled) return 0; - /* - * Time as measured by the TSC may go backwards when resetting the base - * tsc_timestamp. The reason for this is that the TSC resolution is - * higher than the resolution of the other clock scales. Thus, many - * possible measurments of the TSC correspond to one measurement of any - * other clock, and so a spread of values is possible. This is not a - * problem for the computation of the nanosecond clock; with TSC rates - * around 1GHZ, there can only be a few cycles which correspond to one - * nanosecond value, and any path through this code will inevitably - * take longer than that. However, with the kernel_ns value itself, - * the precision may be much lower, down to HZ granularity. If the - * first sampling of TSC against kernel_ns ends in the low part of the - * range, and the second in the high end of the range, we can get: - * - * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new - * - * As the sampling errors potentially range in the thousands of cycles, - * it is possible such a time value has already been observed by the - * guest. To protect against this, we must compute the system time as - * observed by the guest and ensure the new system time is greater. - */ - max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp) { - max_kernel_ns = vcpu->last_guest_tsc - - vcpu->hv_clock.tsc_timestamp; - max_kernel_ns = pvclock_scale_delta(max_kernel_ns, - vcpu->hv_clock.tsc_to_system_mul, - vcpu->hv_clock.tsc_shift); - max_kernel_ns += vcpu->last_kernel_ns; - } - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, &vcpu->hv_clock.tsc_shift, @@ -1584,14 +1553,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - /* with a master tuple, - * pvclock clock reads always increase at the (scaled) rate - * of guest TSC - no need to deal with sampling errors. - */ - if (!use_master_clock) { - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - } /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; From aab6d7ce37cf20753a336dc74473cf8a8aefa7c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jan 2014 18:07:31 +0100 Subject: [PATCH 55/67] KVM: remove useless write to vcpu->hv_clock.tsc_timestamp After the previous patch from Marcelo, the comment before this write became obsolete. In fact, the write is unnecessary. The calls to kvm_write_tsc ultimately result in a master clock update as soon as all TSCs agree and the master clock is re-enabled. This master clock update will rewrite tsc_timestamp. So, together with the comment, delete the dead write too. Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 842abd33e9b5..0fbdced78737 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1278,8 +1278,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - /* Reset of TSC must disable overshoot protection below */ - vcpu->arch.hv_clock.tsc_timestamp = 0; vcpu->arch.last_guest_tsc = data; /* Keep track of which generation this VCPU has synchronized to */ From e984097b553ed2d6551c805223e4057421370f00 Mon Sep 17 00:00:00 2001 From: Vadim Rozenfeld Date: Thu, 16 Jan 2014 20:18:37 +1100 Subject: [PATCH 56/67] add support for Hyper-V reference time counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off: Peter Lieven Signed-off: Gleb Natapov Signed-off: Vadim Rozenfeld After some consideration I decided to submit only Hyper-V reference counters support this time. I will submit iTSC support as a separate patch as soon as it is ready. v1 -> v2 1. mark TSC page dirty as suggested by Eric Northup and Gleb 2. disable local irq when calling get_kernel_ns, as it was done by Peter Lieven 3. move check for TSC page enable from second patch to this one. v3 -> v4     Get rid of ref counter offset. v4 -> v5 replace __copy_to_user with kvm_write_guest when updateing iTSC page. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/hyperv.h | 13 +++++++++++++ arch/x86/kvm/x86.c | 28 +++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 1 + 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ae5d7830855c..33fef0738a29 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -605,6 +605,7 @@ struct kvm_arch { /* fields used by HYPER-V emulation */ u64 hv_guest_os_id; u64 hv_hypercall; + u64 hv_tsc_page; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index b8f1c0176cbc..462efe746d77 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -28,6 +28,9 @@ /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* A partition's reference time stamp counter (TSC) page */ +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 + /* * There is a single feature flag that signifies the presence of the MSR * that can be used to retrieve both the local APIC Timer frequency as @@ -198,6 +201,9 @@ #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) +#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 +#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 + #define HV_PROCESSOR_POWER_STATE_C0 0 #define HV_PROCESSOR_POWER_STATE_C1 1 #define HV_PROCESSOR_POWER_STATE_C2 2 @@ -210,4 +216,11 @@ #define HV_STATUS_INVALID_ALIGNMENT 4 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 +typedef struct _HV_REFERENCE_TSC_PAGE { + __u32 tsc_sequence; + __u32 res1; + __u64 tsc_scale; + __s64 tsc_offset; +} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0fbdced78737..0b3fd809a3c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -839,11 +839,12 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 10 +#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, @@ -1788,6 +1789,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: r = true; break; } @@ -1829,6 +1832,20 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm->arch.hv_hypercall = data; break; } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + memset(&tsc_ref, 0, sizeof(tsc_ref)); + kvm->arch.hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest(kvm, data, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); + break; + } default: vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " "data 0x%llx\n", msr, data); @@ -2253,6 +2270,14 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_HYPERCALL: data = kvm->arch.hv_hypercall; break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = kvm->arch.hv_tsc_page; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -2566,6 +2591,7 @@ int kvm_dev_ioctl_check_extension(long ext) #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: + case KVM_CAP_HYPERV_TIME: #endif r = 1; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b647c2917391..932d7f2637d6 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -674,6 +674,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_EL1_32BIT 93 #define KVM_CAP_SPAPR_MULTITCE 94 #define KVM_CAP_EXT_EMUL_CPUID 95 +#define KVM_CAP_HYPERV_TIME 96 #ifdef KVM_CAP_IRQ_ROUTING From 9926c9fdbdd54bb229fe6fdbd15ca3af2b8425ae Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:15 +0100 Subject: [PATCH 57/67] KVM: x86: Sync DR7 on KVM_SET_DEBUGREGS Whenever we change arch.dr7, we also have to call kvm_update_dr7. In case guest debugging is off, this will synchronize the new state into hardware. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b3fd809a3c7..59907c9a9d05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2976,6 +2976,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; + kvm_update_dr7(vcpu); return 0; } From 73aaf249ee2287b4686ff079dcbdbbb658156e64 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:16 +0100 Subject: [PATCH 58/67] KVM: SVM: Fix reading of DR6 In contrast to VMX, SVM dose not automatically transfer DR6 into the VCPU's arch.dr6. So if we face a DR6 read, we must consult a new vendor hook to obtain the current value. And as SVM now picks the DR6 state from its VMCB, we also need a set callback in order to write updates of DR6 back. Fixes a regression of 020df0794f. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 15 +++++++++++++++ arch/x86/kvm/vmx.c | 11 +++++++++++ arch/x86/kvm/x86.c | 19 +++++++++++++++++-- 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33fef0738a29..fdf83afbb7d9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -700,6 +700,8 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + u64 (*get_dr6)(struct kvm_vcpu *vcpu); + void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c7168a5cff1b..e81df8fce027 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1671,6 +1671,19 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) mark_dirty(svm->vmcb, VMCB_ASID); } +static u64 svm_get_dr6(struct kvm_vcpu *vcpu) +{ + return to_svm(vcpu)->vmcb->save.dr6; +} + +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.dr6 = value; + mark_dirty(svm->vmcb, VMCB_DR); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4286,6 +4299,8 @@ static struct kvm_x86_ops svm_x86_ops = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .get_dr6 = svm_get_dr6, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7661eb171936..79b360e4fed1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5149,6 +5149,15 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; } +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.dr6; +} + +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -8556,6 +8565,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59907c9a9d05..59b95b1a04dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -722,6 +722,12 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr6(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); +} + static void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -750,6 +756,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + kvm_update_dr6(vcpu); break; case 5: if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -791,7 +798,10 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 1; /* fall through */ case 6: - *val = vcpu->arch.dr6; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + *val = vcpu->arch.dr6; + else + *val = kvm_x86_ops->get_dr6(vcpu); break; case 5: if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) @@ -2960,8 +2970,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { + unsigned long val; + memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - dbgregs->dr6 = vcpu->arch.dr6; + _kvm_get_dr(vcpu, 6, &val); + dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); @@ -2975,6 +2988,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; + kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); @@ -6749,6 +6763,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; + kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); From 8246bf52c75aa9b9b336a84f31ed2248754d0f71 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:17 +0100 Subject: [PATCH 59/67] KVM: VMX: Fix DR6 update on #DB exception According to the SDM, only bits 0-3 of DR6 "may" be cleared by "certain" debug exception. So do update them on #DB exception in KVM, but leave the rest alone, only setting BD and BS in addition to already set bits in DR6. This also aligns us with kvm_vcpu_check_singlestep. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 79b360e4fed1..c8eb27ff1a2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4869,7 +4869,8 @@ static int handle_exception(struct kvm_vcpu *vcpu) dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; kvm_queue_exception(vcpu, DB_VECTOR); return 1; } From 42124925c1f580068661bebd963d7c102175a8a9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:19 +0100 Subject: [PATCH 60/67] KVM: nVMX: Leave VMX mode on clearing of feature control MSR When userspace sets MSR_IA32_FEATURE_CONTROL to 0, make sure we leave root and non-root mode, fully disabling VMX. The register state of the VCPU is undefined after this step, so userspace has to set it to a proper state afterward. This enables to reboot a VM while it is running some hypervisor code. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c8eb27ff1a2d..bff55554faec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2455,6 +2455,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; } +static void vmx_leave_nested(struct kvm_vcpu *vcpu); + static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr_index = msr_info->index; @@ -2470,6 +2472,8 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) & FEATURE_CONTROL_LOCKED) return 0; to_vmx(vcpu)->nested.msr_ia32_feature_control = data; + if (host_initialized && data == 0) + vmx_leave_nested(vcpu); return 1; } @@ -8503,6 +8507,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) vmx->nested.sync_shadow_vmcs = true; } +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +static void vmx_leave_nested(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + nested_vmx_vmexit(vcpu); + free_nested(to_vmx(vcpu)); +} + /* * L1's failure to enter L2 is a subset of a normal exit, as explained in * 23.7 "VM-entry failures during or after loading guest state" (this also From 533558bcb69ef28aff81b6ae9acda8943575319f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:20 +0100 Subject: [PATCH 61/67] KVM: nVMX: Pass vmexit parameters to nested_vmx_vmexit Instead of fixing up the vmcs12 after the nested vmexit, pass key parameters already when calling nested_vmx_vmexit. This will help tracing those vmexits. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 63 +++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bff55554faec..e3578b301d81 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1058,7 +1058,9 @@ static inline bool is_exception(u32 intr_info) == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); } -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification); static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 reason, unsigned long qualification); @@ -1967,7 +1969,9 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -4649,15 +4653,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_run_pending) return 0; if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; - vmcs12->vm_exit_intr_info = NMI_VECTOR | - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); /* * The NMI-triggered VM exit counts as injection: * clear this one and block further NMIs. @@ -4679,15 +4680,11 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_run_pending) return 0; if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = - EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + 0, 0); /* * fall through to normal code, but now in L1, not L2 */ @@ -6849,7 +6846,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return handle_invalid_guest_state(vcpu); if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); return 1; } @@ -7590,15 +7589,14 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { - struct vmcs12 *vmcs12; - nested_vmx_vmexit(vcpu); - vmcs12 = get_vmcs12(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 exit_reason; if (fault->error_code & PFERR_RSVD_MASK) - vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + exit_reason = EXIT_REASON_EPT_MISCONFIG; else - vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; - vmcs12->exit_qualification = vcpu->arch.exit_qualification; + exit_reason = EXIT_REASON_EPT_VIOLATION; + nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -7636,7 +7634,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); else kvm_inject_page_fault(vcpu, fault); } @@ -8191,7 +8191,9 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. */ -static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 exit_reason, u32 exit_intr_info, + unsigned long exit_qualification) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -8282,10 +8284,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) /* update exit information fields: */ - vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason; - vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + vmcs12->vm_exit_reason = exit_reason; + vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmcs12->vm_exit_intr_info = exit_intr_info; if ((vmcs12->vm_exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) @@ -8452,7 +8454,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; @@ -8462,7 +8466,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->nested.nested_run_pending); leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12); + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); cpu = get_cpu(); vmx->loaded_vmcs = &vmx->vmcs01; @@ -8513,7 +8518,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) static void vmx_leave_nested(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) - nested_vmx_vmexit(vcpu); + nested_vmx_vmexit(vcpu, -1, 0, 0); free_nested(to_vmx(vcpu)); } From 542060ea79c861e100411a5a44df747b56a693df Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:21 +0100 Subject: [PATCH 62/67] KVM: nVMX: Add tracepoints for nested_vmexit and nested_vmexit_inject Already used by nested SVM for tracing nested vmexit: kvm_nested_vmexit marks exits from L2 to L0 while kvm_nested_vmexit_inject marks vmexits that are reflected to L1. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e3578b301d81..e539c45eb669 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6697,6 +6697,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); u32 exit_reason = vmx->exit_reason; + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) return 0; @@ -8469,6 +8476,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + cpu = get_cpu(); vmx->loaded_vmcs = &vmx->vmcs01; vmx_vcpu_put(vcpu); From cae501397a25dc1e88375925c5e93a264d4a55ba Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:22 +0100 Subject: [PATCH 63/67] KVM: nVMX: Clean up handling of VMX-related MSRs This simplifies the code and also stops issuing warning about writing to unhandled MSRs when VMX is disabled or the Feature Control MSR is locked - we do handle them all according to the spec. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/msr-index.h | 1 + arch/x86/kvm/vmx.c | 79 ++++++++------------------- 2 files changed, 24 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 37813b5ddc37..2e4a42d31cfe 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -527,6 +527,7 @@ #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 +#define MSR_IA32_VMX_VMFUNC 0x00000491 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e539c45eb669..fc4a255d5426 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2361,32 +2361,10 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* - * If we allow our guest to use VMX instructions (i.e., nested VMX), we should - * also let it use VMX-specific MSRs. - * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a - * VMX-specific MSR, or 0 when we haven't (and the caller should handle it - * like all other MSRs). - */ +/* Returns 0 on success, non-0 otherwise. */ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && - msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { - /* - * According to the spec, processors which do not support VMX - * should throw a #GP(0) when VMX capability MSRs are read. - */ - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - switch (msr_index) { - case MSR_IA32_FEATURE_CONTROL: - if (nested_vmx_allowed(vcpu)) { - *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; - break; - } - return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2453,38 +2431,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = nested_vmx_ept_caps; break; default: - return 0; - } - - return 1; -} - -static void vmx_leave_nested(struct kvm_vcpu *vcpu); - -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - u32 msr_index = msr_info->index; - u64 data = msr_info->data; - bool host_initialized = msr_info->host_initiated; - - if (!nested_vmx_allowed(vcpu)) - return 0; - - if (msr_index == MSR_IA32_FEATURE_CONTROL) { - if (!host_initialized && - to_vmx(vcpu)->nested.msr_ia32_feature_control - & FEATURE_CONTROL_LOCKED) - return 0; - to_vmx(vcpu)->nested.msr_ia32_feature_control = data; - if (host_initialized && data == 0) - vmx_leave_nested(vcpu); return 1; } - /* - * No need to treat VMX capability MSRs specially: If we don't handle - * them, handle_wrmsr will #GP(0), which is correct (they are readonly) - */ return 0; } @@ -2530,13 +2479,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu)) + return 1; + data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_get_vmx_msr(vcpu, msr_index, pdata); case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) - return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; @@ -2549,6 +2505,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static void vmx_leave_nested(struct kvm_vcpu *vcpu); + /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -2603,6 +2561,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_FEATURE_CONTROL: + if (!nested_vmx_allowed(vcpu) || + (to_vmx(vcpu)->nested.msr_ia32_feature_control & + FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) + return 1; + vmx->nested.msr_ia32_feature_control = data; + if (msr_info->host_initiated && data == 0) + vmx_leave_nested(vcpu); + break; + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: + return 1; /* they are read-only */ case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2611,8 +2580,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_info)) - break; msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; From 7af40ad37b3f097f367cbe9c0198caccce6fd83b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:23 +0100 Subject: [PATCH 64/67] KVM: nVMX: Fix nested_run_pending on activity state HLT When we suspend the guest in HLT state, the nested run is no longer pending - we emulated it completely. So only set nested_run_pending after checking the activity state. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fc4a255d5426..f9a54331c808 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8046,8 +8046,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enter_guest_mode(vcpu); - vmx->nested.nested_run_pending = 1; - vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); cpu = get_cpu(); @@ -8066,6 +8064,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return kvm_emulate_halt(vcpu); + vmx->nested.nested_run_pending = 1; + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet From 3edf1e698ff638c2ab095e8c60fd11c5d292fc5f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 4 Jan 2014 18:47:24 +0100 Subject: [PATCH 65/67] KVM: nVMX: Update guest activity state field on L2 exits Set guest activity state in L1's VMCS according to the VCPUs mp_state. This ensures we report the correct state in case we L2 executed HLT or if we put L2 into HLT state and it was now woken up by an event. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f9a54331c808..407b05c29d7b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8219,6 +8219,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; + else + vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) From 699bde3b6c95319749a8e1b7aa2b3f6bee84bff8 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 20 Jan 2014 12:34:13 +0100 Subject: [PATCH 66/67] KVM: s390: Fix memory access error detection Seems that commit 210b1607012cc9034841a393e0591b2c86d9e26c (KVM: s390: Removed SIE_INTERCEPT_UCONTROL) lost a hunk when we reworked our patch queue to rework the async_fp code. We now ignore faults on the sie instruction (guest accesses non-existing memory) instead of sending a fault into the guest. This leads to hang situations with the old virtio transport that checks for descriptor memory after guest memory. Instead of bailing out this code now goes wild... Lets re-add the check. Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1bb1ddaf93c0..7635c00a1479 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -738,6 +738,10 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) current->thread.gmap_addr; vcpu->run->s390_ucontrol.pgm_code = 0x10; rc = -EREMOTE; + } else { + VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); + trace_kvm_s390_sie_fault(vcpu); + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); From 94491620e1362f6065ab821c13eb54b716ada19f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jan 2014 18:02:38 -0800 Subject: [PATCH 67/67] kvm: make KVM_MMU_AUDIT help text more readable Make KVM_MMU_AUDIT kconfig help text readable and collapse two spaces between words down to one space. Signed-off-by: Randy Dunlap Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b89c5db2b832..287e4c85fff9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -80,7 +80,7 @@ config KVM_MMU_AUDIT depends on KVM && TRACEPOINTS ---help--- This option adds a R/W kVM module parameter 'mmu_audit', which allows - audit KVM MMU at runtime. + auditing of KVM MMU events at runtime. config KVM_DEVICE_ASSIGNMENT bool "KVM legacy PCI device assignment support"