linux/arch/x86/kvm/vmx/vmcs.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_VMCS_H
#define __KVM_X86_VMX_VMCS_H

#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/nospec.h>

#include <asm/kvm.h>
#include <asm/vmx.h>

#include "capabilities.h"

struct vmcs_hdr {
	u32 revision_id:31;
	u32 shadow_vmcs:1;
};

struct vmcs {
	struct vmcs_hdr hdr;
	u32 abort;
	char data[0];
};

DECLARE_PER_CPU(struct vmcs *, current_vmcs);

/*
 * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
 * and whose values change infrequently, but are not constant.  I.e. this is
 * used as a write-through cache of the corresponding VMCS fields.
 */
struct vmcs_host_state {
	unsigned long cr3;	/* May not match real cr3 */
	unsigned long cr4;	/* May not match real cr4 */
	unsigned long gs_base;
	unsigned long fs_base;
	unsigned long rsp;

	u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
	u16           ds_sel, es_sel;
#endif
};

struct vmcs_controls_shadow {
	u32 vm_entry;
	u32 vm_exit;
	u32 pin;
	u32 exec;
	u32 secondary_exec;
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	struct vmcs *shadow_vmcs;
	int cpu;
	bool launched;
	bool nmi_known_unmasked;
	bool hv_timer_soft_disabled;
	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	unsigned long *msr_bitmap;
	struct list_head loaded_vmcss_on_cpu_link;
	struct vmcs_host_state host_state;
	struct vmcs_controls_shadow controls_shadow;
};

static inline bool is_exception_n(u32 intr_info, u8 vector)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
}

static inline bool is_debug(u32 intr_info)
{
	return is_exception_n(intr_info, DB_VECTOR);
}

static inline bool is_breakpoint(u32 intr_info)
{
	return is_exception_n(intr_info, BP_VECTOR);
}

static inline bool is_page_fault(u32 intr_info)
{
	return is_exception_n(intr_info, PF_VECTOR);
}

static inline bool is_invalid_opcode(u32 intr_info)
{
	return is_exception_n(intr_info, UD_VECTOR);
}

static inline bool is_gp_fault(u32 intr_info)
{
	return is_exception_n(intr_info, GP_VECTOR);
}

static inline bool is_machine_check(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}

/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
}

static inline bool is_nmi(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
}

static inline bool is_external_intr(u32 intr_info)
{
	return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
		== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR);
}

enum vmcs_field_width {
	VMCS_FIELD_WIDTH_U16 = 0,
	VMCS_FIELD_WIDTH_U64 = 1,
	VMCS_FIELD_WIDTH_U32 = 2,
	VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
};

static inline int vmcs_field_width(unsigned long field)
{
	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
		return VMCS_FIELD_WIDTH_U32;
	return (field >> 13) & 0x3;
}

static inline int vmcs_field_readonly(unsigned long field)
{
	return (((field >> 10) & 0x3) == 1);
}

#endif /* __KVM_X86_VMX_VMCS_H */
KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`/* SPDX-License-Identifier: GPL-2.0 */`
			`#ifndef __KVM_X86_VMX_VMCS_H`
			`#define __KVM_X86_VMX_VMCS_H`

			`#include <linux/ktime.h>`
KVM: nVMX: Move vmcs12 code to dedicated files vmcs12 is the KVM-defined struct used to track a nested VMCS, e.g. a VMCS created by L1 for L2. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:05 +01:00			`#include <linux/list.h>`
			`#include <linux/nospec.h>`
KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00
KVM: nVMX: Move vmcs12 code to dedicated files vmcs12 is the KVM-defined struct used to track a nested VMCS, e.g. a VMCS created by L1 for L2. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:05 +01:00			`#include <asm/kvm.h>`
KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`#include <asm/vmx.h>`

			`#include "capabilities.h"`

			`struct vmcs_hdr {`
			`u32 revision_id:31;`
			`u32 shadow_vmcs:1;`
			`};`

			`struct vmcs {`
			`struct vmcs_hdr hdr;`
			`u32 abort;`
			`char data[0];`
			`};`

KVM: VMX: Move eVMCS code to dedicated files The header, evmcs.h, already exists and contains a fair amount of code, but there are a few pieces in vmx.c that can be moved verbatim. In addition, move an array definition to evmcs.c to prepare for multiple consumers of evmcs.h. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:06 +01:00			`DECLARE_PER_CPU(struct vmcs *, current_vmcs);`

KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`/*`
			`* vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT`
			`* and whose values change infrequently, but are not constant. I.e. this is`
			`* used as a write-through cache of the corresponding VMCS fields.`
			`*/`
			`struct vmcs_host_state {`
			`unsigned long cr3; /* May not match real cr3 */`
			`unsigned long cr4; /* May not match real cr4 */`
			`unsigned long gs_base;`
			`unsigned long fs_base;`
KVM: nVMX: Cache host_rsp on a per-VMCS basis Currently, host_rsp is cached on a per-vCPU basis, i.e. it's stored in struct vcpu_vmx. In non-nested usage the caching is for all intents and purposes 100% effective, e.g. only the first VMLAUNCH needs to synchronize VMCS.HOST_RSP since the call stack to vmx_vcpu_run() is identical each and every time. But when running a nested guest, KVM must invalidate the cache when switching the current VMCS as it can't guarantee the new VMCS has the same HOST_RSP as the previous VMCS. In other words, the cache loses almost all of its efficacy when running a nested VM. Move host_rsp to struct vmcs_host_state, which is per-VMCS, so that it is cached on a per-VMCS basis and restores its 100% hit rate when nested VMs are in play. Note that the host_rsp cache for vmcs02 essentially "breaks" when nested early checks are enabled as nested_vmx_check_vmentry_hw() will see a different RSP at the time of its VM-Enter. While it's possible to avoid even that VMCS.HOST_RSP synchronization, e.g. by employing a dedicated VM-Exit stack, there is little motivation for doing so as the overhead of two VMWRITEs (~55 cycles) is dwarfed by the overhead of the extra VMX transition (600+ cycles) and is a proverbial drop in the ocean relative to the total cost of a nested transtion (10s of thousands of cycles). Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Reviewed-by: Jim Mattson <jmattson@google.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2019-01-25 16:41:02 +01:00			`unsigned long rsp;`
KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00
			`u16 fs_sel, gs_sel, ldt_sel;`
			`#ifdef CONFIG_X86_64`
			`u16 ds_sel, es_sel;`
			`#endif`
			`};`

KVM: nVMX: Shadow VMCS controls on a per-VMCS basis ... to pave the way for not preserving the shadow copies across switches between vmcs01 and vmcs02, and eventually to avoid VMWRITEs to vmcs02 when the desired value is unchanged across nested VM-Enters. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2019-05-07 21:17:58 +02:00			`struct vmcs_controls_shadow {`
			`u32 vm_entry;`
			`u32 vm_exit;`
			`u32 pin;`
			`u32 exec;`
			`u32 secondary_exec;`
			`};`

KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`/*`
			`* Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also`
			`* remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs`
			`* loaded on this CPU (so we can clear them if the CPU goes down).`
			`*/`
			`struct loaded_vmcs {`
			`struct vmcs *vmcs;`
			`struct vmcs *shadow_vmcs;`
			`int cpu;`
			`bool launched;`
			`bool nmi_known_unmasked;`
KVM: VMX: Leave preemption timer running when it's disabled VMWRITEs to the major VMCS controls, pin controls included, are deceptively expensive. CPUs with VMCS caching (Westmere and later) also optimize away consistency checks on VM-Entry, i.e. skip consistency checks if the relevant fields have not changed since the last successful VM-Entry (of the cached VMCS). Because uops are a precious commodity, uCode's dirty VMCS field tracking isn't as precise as software would prefer. Notably, writing any of the major VMCS fields effectively marks the entire VMCS dirty, i.e. causes the next VM-Entry to perform all consistency checks, which consumes several hundred cycles. As it pertains to KVM, toggling PIN_BASED_VMX_PREEMPTION_TIMER more than doubles the latency of the next VM-Entry (and again when/if the flag is toggled back). In a non-nested scenario, running a "standard" guest with the preemption timer enabled, toggling the timer flag is uncommon but not rare, e.g. roughly 1 in 10 entries. Disabling the preemption timer can change these numbers due to its use for "immediate exits", even when explicitly disabled by userspace. Nested virtualization in particular is painful, as the timer flag is set for the majority of VM-Enters, but prepare_vmcs02() initializes vmcs02's pin controls to clear the flag since its the timer's final state isn't known until vmx_vcpu_run(). I.e. the majority of nested VM-Enters end up unnecessarily writing pin controls twice. Rather than toggle the timer flag in pin controls, set the timer value itself to the largest allowed value to put it into a "soft disabled" state, and ignore any spurious preemption timer exits. Sadly, the timer is a 32-bit value and so theoretically it can fire before the head death of the universe, i.e. spurious exits are possible. But because KVM does not save the timer value on VM-Exit and because the timer runs at a slower rate than the TSC, the maximuma timer value is still sufficiently large for KVM's purposes. E.g. on a modern CPU with a timer that runs at 1/32 the frequency of a 2.4ghz constant-rate TSC, the timer will fire after ~55 seconds of uninterrupted guest execution. In other words, spurious VM-Exits are effectively only possible if the host is completely tickless on the logical CPU, the guest is not using the preemption timer, and the guest is not generating VM-Exits for any other reason. To be safe from bad/weird hardware, disable the preemption timer if its maximum delay is less than ten seconds. Ten seconds is mostly arbitrary and was selected in no small part because it's a nice round number. For simplicity and paranoia, fall back to __kvm_request_immediate_exit() if the preemption timer is disabled by KVM or userspace. Previously KVM continued to use the preemption timer to force immediate exits even when the timer was disabled by userspace. Now that KVM leaves the timer running instead of truly disabling it, allow userspace to kill it entirely in the unlikely event the timer (or KVM) malfunctions. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2019-05-07 21:18:05 +02:00			`bool hv_timer_soft_disabled;`
KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`/* Support for vnmi-less CPUs */`
			`int soft_vnmi_blocked;`
			`ktime_t entry_time;`
			`s64 vnmi_blocked_time;`
			`unsigned long *msr_bitmap;`
			`struct list_head loaded_vmcss_on_cpu_link;`
			`struct vmcs_host_state host_state;`
KVM: nVMX: Shadow VMCS controls on a per-VMCS basis ... to pave the way for not preserving the shadow copies across switches between vmcs01 and vmcs02, and eventually to avoid VMWRITEs to vmcs02 when the desired value is unchanged across nested VM-Enters. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2019-05-07 21:17:58 +02:00			`struct vmcs_controls_shadow controls_shadow;`
KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`};`

			`static inline bool is_exception_n(u32 intr_info, u8 vector)`
			`{`
			`return (intr_info & (INTR_INFO_INTR_TYPE_MASK \| INTR_INFO_VECTOR_MASK \|`
			`INTR_INFO_VALID_MASK)) ==`
			`(INTR_TYPE_HARD_EXCEPTION \| vector \| INTR_INFO_VALID_MASK);`
			`}`

			`static inline bool is_debug(u32 intr_info)`
			`{`
			`return is_exception_n(intr_info, DB_VECTOR);`
			`}`

			`static inline bool is_breakpoint(u32 intr_info)`
			`{`
			`return is_exception_n(intr_info, BP_VECTOR);`
			`}`

			`static inline bool is_page_fault(u32 intr_info)`
			`{`
			`return is_exception_n(intr_info, PF_VECTOR);`
			`}`

			`static inline bool is_invalid_opcode(u32 intr_info)`
			`{`
			`return is_exception_n(intr_info, UD_VECTOR);`
			`}`

			`static inline bool is_gp_fault(u32 intr_info)`
			`{`
			`return is_exception_n(intr_info, GP_VECTOR);`
			`}`

			`static inline bool is_machine_check(u32 intr_info)`
			`{`
			`return (intr_info & (INTR_INFO_INTR_TYPE_MASK \| INTR_INFO_VECTOR_MASK \|`
			`INTR_INFO_VALID_MASK)) ==`
			`(INTR_TYPE_HARD_EXCEPTION \| MC_VECTOR \| INTR_INFO_VALID_MASK);`
			`}`

			`/* Undocumented: icebp/int1 */`
			`static inline bool is_icebp(u32 intr_info)`
			`{`
			`return (intr_info & (INTR_INFO_INTR_TYPE_MASK \| INTR_INFO_VALID_MASK))`
			`== (INTR_TYPE_PRIV_SW_EXCEPTION \| INTR_INFO_VALID_MASK);`
			`}`

			`static inline bool is_nmi(u32 intr_info)`
			`{`
			`return (intr_info & (INTR_INFO_INTR_TYPE_MASK \| INTR_INFO_VALID_MASK))`
			`== (INTR_TYPE_NMI_INTR \| INTR_INFO_VALID_MASK);`
			`}`

KVM: VMX: Read cached VM-Exit reason to detect external interrupt Generic x86 code invokes the kvm_x86_ops external interrupt handler on all VM-Exits regardless of the actual exit type. Use the already-cached EXIT_REASON to determine if the VM-Exit was due to an interrupt, thus avoiding an extra VMREAD (to query VM_EXIT_INTR_INFO) for all other types of VM-Exit. In addition to avoiding the extra VMREAD, checking the EXIT_REASON instead of VM_EXIT_INTR_INFO makes it more obvious that vmx_handle_external_intr() is called for all VM-Exits, e.g. someone unfamiliar with the flow might wonder under what condition(s) VM_EXIT_INTR_INFO does not contain a valid interrupt, which is simply not possible since KVM always runs with "ack interrupt on exit". WARN once if VM_EXIT_INTR_INFO doesn't contain a valid interrupt on an EXTERNAL_INTERRUPT VM-Exit, as such a condition would indicate a hardware bug. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2019-04-20 07:50:56 +02:00			`static inline bool is_external_intr(u32 intr_info)`
			`{`
			`return (intr_info & (INTR_INFO_VALID_MASK \| INTR_INFO_INTR_TYPE_MASK))`
			`== (INTR_INFO_VALID_MASK \| INTR_TYPE_EXT_INTR);`
			`}`

KVM: VMX: Move VMCS definitions to dedicated file This isn't intended to be a pure reflection of hardware, e.g. struct loaded_vmcs and struct vmcs_host_state are KVM-defined constructs. Similar to capabilities.h, this is a standalone file to avoid circular dependencies between yet-to-be-created vmx.h and nested.h files. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> 2018-12-03 22:53:04 +01:00			`enum vmcs_field_width {`
			`VMCS_FIELD_WIDTH_U16 = 0,`
			`VMCS_FIELD_WIDTH_U64 = 1,`
			`VMCS_FIELD_WIDTH_U32 = 2,`
			`VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3`
			`};`

			`static inline int vmcs_field_width(unsigned long field)`
			`{`
			`if (0x1 & field) /* the _HIGH fields are all 32 bit /`
			`return VMCS_FIELD_WIDTH_U32;`
			`return (field >> 13) & 0x3;`
			`}`

			`static inline int vmcs_field_readonly(unsigned long field)`
			`{`
			`return (((field >> 10) & 0x3) == 1);`
			`}`

			`#endif /* __KVM_X86_VMX_VMCS_H */`