diff --git a/Makefile b/Makefile index 1dca17a..cb8cdfd 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 4 SUBLEVEL = 91 -EXTRAVERSION = -2.9 +EXTRAVERSION = -2.11 NAME = Kleptomaniac Octopus # *DOCUMENTATION* diff --git a/arch/e2k/include/asm-l/epic.h b/arch/e2k/include/asm-l/epic.h index d965572..973b099 100644 --- a/arch/e2k/include/asm-l/epic.h +++ b/arch/e2k/include/asm-l/epic.h @@ -77,6 +77,15 @@ extern void epic_send_IPI_mask_allbutself(const struct cpumask *mask, extern void epic_wait_icr_idle(void); extern void clear_cepic(void); +extern bool pcsm_adjust_enable; + +struct pcs_handle { + void (*pcs_interrupt)(void); +}; + +extern void register_pcs_handle(const struct pcs_handle *handle); +extern void unregister_pcs_handle(void); + extern __visible void epic_smp_timer_interrupt(struct pt_regs *regs); extern __visible void epic_smp_spurious_interrupt(struct pt_regs *regs); extern __visible void epic_smp_error_interrupt(struct pt_regs *regs); diff --git a/arch/e2k/include/asm/copy-hw-stacks.h b/arch/e2k/include/asm/copy-hw-stacks.h new file mode 100644 index 0000000..7a20489 --- /dev/null +++ b/arch/e2k/include/asm/copy-hw-stacks.h @@ -0,0 +1,852 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * include/asm-e2k/copy-hw-stacks.h + * + * Copyright 2021 mcst.ru + */ + +#ifndef _E2K_COPY_HW_STACKS_H +#define _E2K_COPY_HW_STACKS_H + +#include + +#include +#include +#include + +#include + +#undef DEBUG_PV_UST_MODE +#undef DebugUST +#define DEBUG_PV_UST_MODE 0 /* guest user stacks debug */ +#define DebugUST(fmt, args...) \ +({ \ + if (debug_guest_ust) \ + pr_info("%s(): " fmt, __func__, ##args); \ +}) + +#undef DEBUG_PV_SYSCALL_MODE +#define DEBUG_PV_SYSCALL_MODE 0 /* syscall injection debugging */ + +#if DEBUG_PV_UST_MODE || DEBUG_PV_SYSCALL_MODE +extern bool debug_guest_ust; +#else +#define debug_guest_ust false +#endif /* DEBUG_PV_UST_MODE || DEBUG_PV_SYSCALL_MODE */ + +#ifndef CONFIG_VIRTUALIZATION +/* it native kernel without virtualization support */ +#else /* CONFIG_VIRTUALIZATION */ +/* It is native host kernel with virtualization support */ +/* or paravirtualized host and guest */ +/* or native guest kernel + #include + */ +#endif /* ! 
CONFIG_VIRTUALIZATION */ + +typedef void (*trace_ps_frame_func_t)(kernel_mem_ps_t *base, kernel_mem_ps_t *frame); +typedef void (*trace_pcs_frame_func_t)(e2k_mem_crs_t *base, e2k_mem_crs_t *frame); + +static inline void trace_proc_stack_frames(kernel_mem_ps_t *dst_ps_base, + kernel_mem_ps_t *src_ps_base, u64 ps_size, + trace_ps_frame_func_t trace_func) +{ + int qreg, qreg_num; + kernel_mem_ps_t *dst_ps_frame, *src_ps_frame; + kernel_mem_ps_t rw; + + qreg_num = ps_size / EXT_4_NR_SZ; + for (qreg = qreg_num - 1; qreg >= 0; qreg--) { + dst_ps_frame = &dst_ps_base[qreg]; + src_ps_frame = &src_ps_base[qreg]; + rw.word_lo = src_ps_frame->word_lo; + if (machine.native_iset_ver < E2K_ISET_V5) { + rw.word_hi = src_ps_frame->word_hi; + rw.ext_lo = src_ps_frame->ext_lo; + rw.ext_hi = src_ps_frame->ext_hi; + } else { + rw.word_hi = src_ps_frame->ext_lo; + rw.ext_lo = src_ps_frame->word_hi; + rw.ext_hi = src_ps_frame->ext_hi; + } + + trace_func(dst_ps_frame, &rw); + } +} + +static inline void trace_chain_stack_frames(e2k_mem_crs_t *dst_pcs_base, + e2k_mem_crs_t *src_pcs_base, u64 pcs_size, + trace_pcs_frame_func_t trace_func) +{ + int crs_no, crs_num; + e2k_mem_crs_t *dst_pcs_frame, *src_pcs_frame; + e2k_mem_crs_t crs; + unsigned long flags; + + crs_num = pcs_size / sizeof(crs); + raw_all_irq_save(flags); + for (crs_no = crs_num - 1; crs_no >= 0; crs_no--) { + dst_pcs_frame = &dst_pcs_base[crs_no]; + src_pcs_frame = &src_pcs_base[crs_no]; + crs = *src_pcs_frame; + + trace_func(dst_pcs_frame, &crs); + } + raw_all_irq_restore(flags); +} + +static inline void trace_host_hva_area(u64 *hva_base, u64 hva_size) +{ + int line_no, line_num; + u64 *dst_hva_line; + unsigned long flags; + + line_num = hva_size / (sizeof(u64) * 4); + raw_all_irq_save(flags); + for (line_no = line_num - 1; line_no >= 0; line_no--) { + dst_hva_line = &hva_base[line_no * 4]; + trace_host_hva_area_line(dst_hva_line, (sizeof(u64) * 4)); + } + if (line_num * (sizeof(u64) * 4) < hva_size) { + dst_hva_line = &hva_base[line_no * 4]; + trace_host_hva_area_line(dst_hva_line, + hva_size - line_num * (sizeof(u64) * 4)); + } + raw_all_irq_restore(flags); +} + +static __always_inline void +native_kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) +{ + void *dst_tail; + const void *src_tail; + u64 copied; + int i; + + /* + * Kernel does not use FP registers so do not copy them. + * This only applies to CPUs before V5 instruction set + * (since V5 FP registers become general-purpose QP registers). 
+ */ + if (cpu_has(CPU_FEAT_QPREG)) { +#pragma loop count (10) + for (i = 0; i < size / 64; i++) + E2K_TAGGED_MEMMOVE_64(&dst[8 * i], &src[8 * i]); + + copied = round_down(size, 64); + dst_tail = (void *) dst + copied; + src_tail = (void *) src + copied; + } else { +#pragma loop count (5) + for (i = 0; i < size / 128; i++) + E2K_TAGGED_MEMMOVE_128_RF_V2(&dst[16 * i], + &src[16 * i]); + + copied = round_down(size, 128); + dst_tail = (void *) dst + copied; + src_tail = (void *) src + copied; + + if (size & 64) { + E2K_TAGGED_MEMMOVE_64(dst_tail, src_tail); + dst_tail += 64; + src_tail += 64; + } + } + + if (size & 32) + E2K_TAGGED_MEMMOVE_32(dst_tail, src_tail); +} + +static __always_inline void +native_collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) +{ + e2k_pcsp_hi_t k_pcsp_hi; + u64 size; + int i; + + DebugUST("current host chain stack index 0x%x, PCSHTP 0x%llx\n", + NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_ind, + NATIVE_READ_PCSHTP_REG_SVALUE()); + + NATIVE_FLUSHC; + k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); + + size = k_pcsp_hi.PCSP_hi_ind - spilled_size; + BUG_ON(!IS_ALIGNED(size, ALIGN_PCSTACK_TOP_SIZE) || (s64) size < 0); +#pragma loop count (2) + for (i = 0; i < size / 32; i++) { + u64 v0, v1, v2, v3; + + v0 = src[4 * i]; + v1 = src[4 * i + 1]; + v2 = src[4 * i + 2]; + v3 = src[4 * i + 3]; + dst[4 * i] = v0; + dst[4 * i + 1] = v1; + dst[4 * i + 2] = v2; + dst[4 * i + 3] = v3; + } + + k_pcsp_hi.PCSP_hi_ind -= spilled_size; + NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); + + DebugUST("move spilled chain part from host top %px to " + "bottom %px, size 0x%llx\n", + src, dst, size); + DebugUST("host kernel chain stack index is now 0x%x, " + "guest user PCSHTP 0x%llx\n", + k_pcsp_hi.PCSP_hi_ind, spilled_size); +} + +static __always_inline void +native_collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) +{ + e2k_psp_hi_t k_psp_hi; + u64 size; + + DebugUST("current host procedure stack index 0x%x, PSHTP 0x%x\n", + NATIVE_NV_READ_PSP_HI_REG().PSP_hi_ind, + NATIVE_NV_READ_PSHTP_REG().PSHTP_ind); + + NATIVE_FLUSHR; + k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); + + size = k_psp_hi.PSP_hi_ind - spilled_size; + BUG_ON(!IS_ALIGNED(size, ALIGN_PSTACK_TOP_SIZE) || (s64) size < 0); + + prefetchw_range(src, size); + native_kernel_hw_stack_frames_copy(dst, src, size); + + k_psp_hi.PSP_hi_ind -= spilled_size; + NATIVE_NV_NOIRQ_WRITE_PSP_HI_REG(k_psp_hi); + + DebugUST("move spilled procedure part from host top %px to " + "bottom %px, size 0x%llx\n", + src, dst, size); + DebugUST("host kernel procedure stack index is now 0x%x, " + "guest user PSHTP 0x%llx\n", + k_psp_hi.PSP_hi_ind, spilled_size); +} + +#if defined(CONFIG_PARAVIRT_GUEST) +/* paravirtualized kernel (host and guest) */ +#include +#elif defined(CONFIG_KVM_GUEST_KERNEL) +/* It is native guest kernel (without paravirtualization) */ +#include +#elif defined(CONFIG_VIRTUALIZATION) || !defined(CONFIG_VIRTUALIZATION) +/* native kernel with virtualization support */ +/* native kernel without virtualization support */ + +static __always_inline void +kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) +{ + native_kernel_hw_stack_frames_copy(dst, src, size); +} +static __always_inline void +collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) +{ + native_collapse_kernel_pcs(dst, src, spilled_size); +} +static __always_inline void +collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) +{ + native_collapse_kernel_ps(dst, src, spilled_size); +} + +#else /* ??? 
*/ + #error "Undefined virtualization mode" +#endif /* CONFIG_PARAVIRT_GUEST */ + +static __always_inline u64 get_wsz(enum restore_caller from) +{ + return NATIVE_READ_WD_REG().size >> 4; +} + +static __always_inline u64 get_ps_clear_size(u64 cur_window_q, + e2k_pshtp_t pshtp) +{ + s64 u_pshtp_size_q; + + u_pshtp_size_q = GET_PSHTP_Q_INDEX(pshtp); + if (u_pshtp_size_q > E2K_MAXSR - cur_window_q) + u_pshtp_size_q = E2K_MAXSR - cur_window_q; + + return E2K_MAXSR - (cur_window_q + u_pshtp_size_q); +} + +static __always_inline s64 get_ps_copy_size(u64 cur_window_q, s64 u_pshtp_size) +{ + return u_pshtp_size - (E2K_MAXSR - cur_window_q) * EXT_4_NR_SZ; +} + +#ifdef CONFIG_CPU_HAS_FILL_INSTRUCTION +# define E2K_CF_MAX_FILL (E2K_CF_MAX_FILL_FILLC_q * 0x10) +#else +extern int cf_max_fill_return; +# define E2K_CF_MAX_FILL cf_max_fill_return +#endif + +static __always_inline s64 get_pcs_copy_size(s64 u_pcshtp_size) +{ + /* Before v6 it was possible to fill no more than 16 registers. + * Since E2K_MAXCR_q is much bigger than 16 we can be sure that + * there is enough space in CF for the FILL, so there is no + * need to take into account space taken by current window. */ + return u_pcshtp_size - E2K_CF_MAX_FILL; +} + +/* + * Copy hardware stack from user to *current* kernel stack. + * One has to be careful to avoid hardware FILL of this stack. + */ +static inline int __copy_user_to_current_hw_stack(void *dst, void __user *src, + unsigned long size, const pt_regs_t *regs, bool chain) +{ + unsigned long min_flt, maj_flt, ts_flag; + + if (likely(!host_test_intc_emul_mode(regs))) { + if (!__range_ok((unsigned long __force) src, size, + PAGE_OFFSET)) + return -EFAULT; + } + + ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); + + /* + * Every page fault here has a chance of FILL'ing the frame + * that is being copied, in which case we repeat the copy. + */ + do { + min_flt = READ_ONCE(current->min_flt); + maj_flt = READ_ONCE(current->maj_flt); + + if (chain) + E2K_FLUSHC; + else + E2K_FLUSHR; + + SET_USR_PFAULT("$.recovery_memcpy_fault"); + fast_tagged_memory_copy_from_user(dst, src, size, regs, + TAGGED_MEM_STORE_REC_OPC | + MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, + TAGGED_MEM_LOAD_REC_OPC | + MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, + true); + if (RESTORE_USR_PFAULT) { + clear_ts_flag(ts_flag); + return -EFAULT; + } + } while (unlikely(min_flt != READ_ONCE(current->min_flt) || + maj_flt != READ_ONCE(current->maj_flt))); + + clear_ts_flag(ts_flag); + return 0; +} + + +static inline int copy_user_to_current_hw_stack(void *dst, void __user *src, + unsigned long size, pt_regs_t *regs, bool chain) +{ + unsigned long flags; + int ret; + + raw_all_irq_save(flags); + ret = __copy_user_to_current_hw_stack(dst, src, size, regs, chain); + raw_all_irq_restore(flags); + + return ret; +} + +static inline int copy_e2k_stack_from_user(void *dst, void __user *src, + unsigned long size, pt_regs_t *regs) +{ + unsigned long ts_flag; + int ret; + + if (likely(!host_test_intc_emul_mode(regs))) { + if (!__range_ok((unsigned long __force) src, size, PAGE_OFFSET)) + return -EFAULT; + } + + ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); + ret = host_copy_from_user_with_tags(dst, src, size, regs); + clear_ts_flag(ts_flag); + + return (ret) ? 
-EFAULT : 0; +} + +static inline int copy_e2k_stack_to_user(void __user *dst, void *src, + unsigned long size, pt_regs_t *regs) +{ + unsigned long ts_flag; + int ret; + + if (likely(!host_test_intc_emul_mode(regs))) { + if (!__range_ok((unsigned long __force) dst, size, PAGE_OFFSET)) + return -EFAULT; + } + + ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); + ret = host_copy_to_user_with_tags(dst, src, size, regs); + clear_ts_flag(ts_flag); + + return (ret) ? -EFAULT : 0; +} + +static __always_inline int +user_hw_stack_frames_copy(void __user *dst, void *src, unsigned long copy_size, + const pt_regs_t *regs, unsigned long hw_stack_ind, bool is_pcsp) +{ + unsigned long ts_flag; + + if (unlikely(hw_stack_ind < copy_size)) { + unsigned long flags; + raw_all_irq_save(flags); + if (is_pcsp) { + E2K_FLUSHC; + } else { + E2K_FLUSHR; + } + raw_all_irq_restore(flags); + } + + SET_USR_PFAULT("$.recovery_memcpy_fault"); + + ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); + fast_tagged_memory_copy_to_user(dst, src, copy_size, regs, + TAGGED_MEM_STORE_REC_OPC | + MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, + TAGGED_MEM_LOAD_REC_OPC | + MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, true); + clear_ts_flag(ts_flag); + + if (RESTORE_USR_PFAULT) { + pr_err("process %s (%d) %s stack could not be copied " + "from %px to %px size 0x%lx (out of memory?)\n", + current->comm, current->pid, + (is_pcsp) ? "chain" : "procedure", + src, dst, copy_size); + return -EFAULT; + } + DebugUST("copying guest %s stack spilled to host from %px " + "to guest kernel stack from %px, size 0x%lx\n", + (is_pcsp) ? "chain" : "procedure", src, dst, copy_size); + + return 0; +} + +static __always_inline int +user_crs_frames_copy(e2k_mem_crs_t __user *u_frame, pt_regs_t *regs, + e2k_mem_crs_t *crs) +{ + unsigned long ts_flag; + int ret; + + ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); + ret = host_copy_to_user(u_frame, crs, sizeof(*crs), regs); + clear_ts_flag(ts_flag); + if (unlikely(ret)) + return -EFAULT; + + return 0; +} + +static __always_inline int user_psp_stack_copy(e2k_psp_lo_t u_psp_lo, + e2k_psp_hi_t u_psp_hi, s64 u_pshtp_size, + e2k_psp_lo_t k_psp_lo, e2k_psp_hi_t k_psp_hi, + unsigned long copy_size, const pt_regs_t *regs) +{ + void __user *dst; + void *src; + int ret; + + dst = (void __user *) (AS(u_psp_lo).base + AS(u_psp_hi).ind - + u_pshtp_size); + src = (void *) AS(k_psp_lo).base; + + if (host_test_intc_emul_mode(regs) && trace_host_copy_hw_stack_enabled()) + trace_host_copy_hw_stack(dst, src, copy_size, false); + + ret = user_hw_stack_frames_copy(dst, src, copy_size, + regs, k_psp_hi.PSP_hi_ind, false); + + if (host_test_intc_emul_mode(regs) && trace_host_proc_stack_frame_enabled()) + trace_proc_stack_frames((kernel_mem_ps_t *)dst, + (kernel_mem_ps_t *)src, copy_size, + trace_host_proc_stack_frame); + + return ret; + +} + +static __always_inline int user_pcsp_stack_copy(e2k_pcsp_lo_t u_pcsp_lo, + e2k_pcsp_hi_t u_pcsp_hi, s64 u_pcshtp_size, + e2k_pcsp_lo_t k_pcsp_lo, e2k_pcsp_hi_t k_pcsp_hi, + unsigned long copy_size, const pt_regs_t *regs) +{ + void __user *dst; + void *src; + int ret; + + dst = (void __user *)(AS(u_pcsp_lo).base + AS(u_pcsp_hi).ind - + u_pcshtp_size); + src = (void *) AS(k_pcsp_lo).base; + + if (host_test_intc_emul_mode(regs) && trace_host_copy_hw_stack_enabled()) + trace_host_copy_hw_stack(dst, src, copy_size, true); + + ret = user_hw_stack_frames_copy(dst, src, copy_size, + regs, k_pcsp_hi.PCSP_hi_ind, true); + + if (host_test_intc_emul_mode(regs) && trace_host_chain_stack_frame_enabled()) + 
trace_chain_stack_frames((e2k_mem_crs_t *)dst, + (e2k_mem_crs_t *)src, copy_size, + trace_host_chain_stack_frame); + + return ret; +} + +/** + * user_hw_stacks_copy - copy user hardware stacks that have been + * SPILLed to kernel back to user space + * @stacks - saved user stack registers + * @cur_window_q - size of current window in procedure stack, + * needed only if @copy_full is not set + * @copy_full - set if want to copy _all_ of SPILLed stacks + * + * This does not update stacks->pshtp and stacks->pcshtp. Main reason is + * signals: if a signal arrives after copying then it must see a coherent + * state where saved stacks->pshtp and stacks->pcshtp values show how much + * data from user space is spilled to kernel space. + */ +static __always_inline int +native_user_hw_stacks_copy(struct e2k_stacks *stacks, + pt_regs_t *regs, u64 cur_window_q, bool copy_full) +{ + trap_pt_regs_t *trap = regs->trap; + e2k_psp_lo_t u_psp_lo = stacks->psp_lo, + k_psp_lo = current_thread_info()->k_psp_lo; + e2k_psp_hi_t u_psp_hi = stacks->psp_hi; + e2k_pcsp_lo_t u_pcsp_lo = stacks->pcsp_lo, + k_pcsp_lo = current_thread_info()->k_pcsp_lo; + e2k_pcsp_hi_t u_pcsp_hi = stacks->pcsp_hi; + s64 u_pshtp_size, u_pcshtp_size, ps_copy_size, pcs_copy_size; + int ret; + + u_pshtp_size = GET_PSHTP_MEM_INDEX(stacks->pshtp); + u_pcshtp_size = PCSHTP_SIGN_EXTEND(stacks->pcshtp); + + /* + * Copy user's part from kernel stacks into user stacks + * Update user's stack registers + */ + if (copy_full) { + pcs_copy_size = u_pcshtp_size; + ps_copy_size = u_pshtp_size; + } else { + pcs_copy_size = get_pcs_copy_size(u_pcshtp_size); + ps_copy_size = get_ps_copy_size(cur_window_q, u_pshtp_size); + + /* Make sure there is enough space in CF for the FILL */ + BUG_ON((E2K_MAXCR_q - 4) * 16 < E2K_CF_MAX_FILL); + } + + if (likely(pcs_copy_size <= 0 && ps_copy_size <= 0)) + return 0; + + if (unlikely(pcs_copy_size > 0)) { + e2k_pcsp_hi_t k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); + + /* Since not all user data has been SPILL'ed it is possible + * that we have already overflown user's hardware stack. */ + if (unlikely(AS(u_pcsp_hi).ind > AS(u_pcsp_hi).size)) { + ret = handle_chain_stack_bounds(stacks, trap); + if (unlikely(ret)) { + pr_warning("process %s (%d) chain stack overflow (out of memory?)\n", + current->comm, current->pid); + return ret; + } + + u_pcsp_lo = stacks->pcsp_lo; + u_pcsp_hi = stacks->pcsp_hi; + } + + ret = user_pcsp_stack_copy(u_pcsp_lo, u_pcsp_hi, u_pcshtp_size, + k_pcsp_lo, k_pcsp_hi, pcs_copy_size, regs); + if (ret) + return ret; + } + + if (unlikely(ps_copy_size > 0)) { + e2k_psp_hi_t k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); + + /* Since not all user data has been SPILL'ed it is possible + * that we have already overflowed user's hardware stack. 
*/ + if (unlikely(AS(u_psp_hi).ind > AS(u_psp_hi).size)) { + ret = handle_proc_stack_bounds(stacks, trap); + if (unlikely(ret)) { + pr_warning("process %s (%d) procedure stack overflow (out of memory?)\n", + current->comm, current->pid); + return ret; + } + + u_psp_lo = stacks->psp_lo; + u_psp_hi = stacks->psp_hi; + } + + ret = user_psp_stack_copy(u_psp_lo, u_psp_hi, u_pshtp_size, + k_psp_lo, k_psp_hi, ps_copy_size, regs); + if (ret) + return ret; + } + + return 0; +} + +static inline void collapse_kernel_hw_stacks(struct e2k_stacks *stacks) +{ + e2k_pcsp_lo_t k_pcsp_lo = current_thread_info()->k_pcsp_lo; + e2k_psp_lo_t k_psp_lo = current_thread_info()->k_psp_lo; + unsigned long flags, spilled_pc_size, spilled_p_size; + e2k_pshtp_t pshtp = stacks->pshtp; + u64 *dst; + const u64 *src; + + spilled_pc_size = PCSHTP_SIGN_EXTEND(stacks->pcshtp); + spilled_p_size = GET_PSHTP_MEM_INDEX(pshtp); + DebugUST("guest user spilled to host kernel stack part: chain 0x%lx " + "procedure 0x%lx\n", + spilled_pc_size, spilled_p_size); + /* When user tries to return from the last user frame + * we will have pcshtp = pcsp_hi.ind = 0. But situation + * with pcsp_hi.ind != 0 and pcshtp = 0 is impossible. */ + if (WARN_ON_ONCE(spilled_pc_size < SZ_OF_CR && + AS(stacks->pcsp_hi).ind != 0)) + do_exit(SIGKILL); + + /* Keep the last user frame (see user_hw_stacks_copy_full()) */ + if (spilled_pc_size >= SZ_OF_CR) { + spilled_pc_size -= SZ_OF_CR; + DebugUST("Keep the prev user chain frame, so spilled chain " + "size is now 0x%lx\n", + spilled_pc_size); + } + + raw_all_irq_save(flags); + + if (spilled_pc_size) { + dst = (u64 *) AS(k_pcsp_lo).base; + src = (u64 *) (AS(k_pcsp_lo).base + spilled_pc_size); + collapse_kernel_pcs(dst, src, spilled_pc_size); + + stacks->pcshtp = SZ_OF_CR; + + apply_graph_tracer_delta(-spilled_pc_size); + } + + if (spilled_p_size) { + dst = (u64 *) AS(k_psp_lo).base; + src = (u64 *) (AS(k_psp_lo).base + spilled_p_size); + collapse_kernel_ps(dst, src, spilled_p_size); + + AS(pshtp).ind = 0; + stacks->pshtp = pshtp; + } + + raw_all_irq_restore(flags); +} + +/** + * user_hw_stacks_prepare - prepare user hardware stacks that have been + * SPILLed to kernel back to user space + * @stacks - saved user stack registers + * @cur_window_q - size of current window in procedure stack, + * needed only if @copy_full is not set + * @syscall - true if called upon direct system call exit (no signal handlers) + * + * This does two things: + * + * 1) It is possible that upon kernel entry pcshtp == 0 in some cases: + * - user signal handler had pcshtp==0x20 before return to sigreturn() + * - user context had pcshtp==0x20 before return to makecontext_trampoline() + * - chain stack underflow happened + * So it is possible in sigreturn() and traps, but not in system calls. + * If we are using the trick with return to FILL user hardware stacks than + * we must have frame in chain stack to return to. So in this case kernel's + * chain stack is moved up by one frame (0x20 bytes). + * We also fill the new frame with actual user data and update stacks->pcshtp, + * this is needed to keep the coherent state where saved stacks->pcshtp values + * shows how much data from user space has been spilled to kernel space. + * + * 2) It is not possible to always FILL all of user data that have been + * SPILLed to kernel stacks. So we manually copy the leftovers that can + * not be FILLed to user space. + * This copy does not update stacks->pshtp and stacks->pcshtp. 
Main reason + * is signals: if a signal arrives after copying then it must see a coherent + * state where saved stacks->pshtp and stacks->pcshtp values show how much + * data from user space has been spilled to kernel space. + */ +static __always_inline void native_user_hw_stacks_prepare( + struct e2k_stacks *stacks, pt_regs_t *regs, + u64 cur_window_q, enum restore_caller from, int syscall) +{ + e2k_pcshtp_t u_pcshtp = stacks->pcshtp; + int ret; + + BUG_ON(from & FROM_PV_VCPU_MODE); + + /* + * 1) Make sure there is free space in kernel chain stack to return to + */ + if (!syscall && u_pcshtp == 0) { + unsigned long flags; + e2k_pcsp_lo_t u_pcsp_lo = stacks->pcsp_lo, + k_pcsp_lo = current_thread_info()->k_pcsp_lo; + e2k_pcsp_hi_t u_pcsp_hi = stacks->pcsp_hi, k_pcsp_hi; + e2k_mem_crs_t __user *u_cframe; + e2k_mem_crs_t *k_crs; + u64 u_cbase; + int ret = -EINVAL; + + raw_all_irq_save(flags); + E2K_FLUSHC; + k_pcsp_hi = READ_PCSP_HI_REG(); + BUG_ON(AS(k_pcsp_hi).ind); + AS(k_pcsp_hi).ind += SZ_OF_CR; + WRITE_PCSP_HI_REG(k_pcsp_hi); + + k_crs = (e2k_mem_crs_t *) AS(k_pcsp_lo).base; + u_cframe = (e2k_mem_crs_t __user *) (AS(u_pcsp_lo).base + + AS(u_pcsp_hi).ind); + u_cbase = ((from & FROM_RETURN_PV_VCPU_TRAP) || + host_test_intc_emul_mode(regs)) ? + u_pcsp_lo.PCSP_lo_base : + (u64) CURRENT_PCS_BASE(); + if ((u64) u_cframe > u_cbase) { + ret = __copy_user_to_current_hw_stack(k_crs, + u_cframe - 1, sizeof(*k_crs), regs, true); + } + raw_all_irq_restore(flags); + + /* Can happen if application returns until runs out of + * chain stack or there is no free memory for stacks. + * There is no user stack to return to - die. */ + if (ret) { + SIGDEBUG_PRINT("SIGKILL. %s\n", + (ret == -EINVAL) ? "tried to return to kernel" : + "ran into Out-of-Memory on user stacks"); + force_sig(SIGKILL); + return; + } + + if (AS(u_pcsp_hi).ind < SZ_OF_CR) { + update_pcsp_regs(AS(u_pcsp_lo).base, + &u_pcsp_lo, &u_pcsp_hi); + stacks->pcsp_lo = u_pcsp_lo; + stacks->pcsp_hi = u_pcsp_hi; + BUG_ON(AS(u_pcsp_hi).ind < SZ_OF_CR); + } + + u_pcshtp = SZ_OF_CR; + stacks->pcshtp = u_pcshtp; + } + + /* + * 2) Copy user data that cannot be FILLed + */ + ret = native_user_hw_stacks_copy(stacks, regs, cur_window_q, false); + if (unlikely(ret)) + do_exit(SIGKILL); +} + +#ifndef CONFIG_VIRTUALIZATION +/* native kernel without virtualization support */ +static __always_inline int +user_hw_stacks_copy(struct e2k_stacks *stacks, + pt_regs_t *regs, u64 cur_window_q, bool copy_full) +{ + return native_user_hw_stacks_copy(stacks, regs, cur_window_q, copy_full); +} + +static __always_inline void +host_user_hw_stacks_prepare(struct e2k_stacks *stacks, pt_regs_t *regs, + u64 cur_window_q, enum restore_caller from, int syscall) +{ + native_user_hw_stacks_prepare(stacks, regs, cur_window_q, + from, syscall); +} +#elif defined(CONFIG_KVM_GUEST_KERNEL) +/* It is native guest kernel (without paravirtualization) */ +#include +#elif defined(CONFIG_PARAVIRT_GUEST) +/* It is paravirtualized kernel (host and guest) */ +#include +#elif defined(CONFIG_KVM_HOST_MODE) +/* It is host kernel with virtualization support */ +#include +#else /* unknow mode */ +#error "unknown virtualization mode" +#endif /* !CONFIG_VIRTUALIZATION */ + +/** + * user_hw_stacks_copy_full - copy part of user stacks that was SPILLed + * into kernel back to user stacks. + * @stacks - saved user stack registers + * @regs - pt_regs pointer + * @crs - last frame to copy + * + * If @crs is not NULL then the frame pointed to by it will also be copied + * to userspace. 
Note that 'stacks->pcsp_hi.ind' is _not_ updated after + * copying since it would leave stack in inconsistent state (with two + * copies of the same @crs frame), this is left to the caller. * + * + * Inlining this reduces the amount of memory to copy in + * collapse_kernel_hw_stacks(). + */ +static inline int do_user_hw_stacks_copy_full(struct e2k_stacks *stacks, + pt_regs_t *regs, e2k_mem_crs_t *crs) +{ + int ret; + + /* + * Copy part of user stacks that were SPILLed into kernel stacks + */ + ret = user_hw_stacks_copy(stacks, regs, 0, true); + if (unlikely(ret)) + return ret; + + /* + * Nothing to FILL so remove the resulting hole from kernel stacks. + * + * IMPORTANT: there is always at least one user frame at the top of + * kernel stack - the one that issued a system call (in case of an + * exception we uphold this rule manually, see user_hw_stacks_prepare()) + * We keep this ABI and _always_ leave space for one user frame, + * this way we can later FILL using return trick (otherwise there + * would be no space in chain stack for the trick). + */ + collapse_kernel_hw_stacks(stacks); + + /* + * Copy saved %cr registers + * + * Caller must take care of filling of resulting hole + * (last user frame from pcshtp == SZ_OF_CR). + */ + if (crs) { + e2k_mem_crs_t __user *u_frame; + int ret; + + u_frame = (void __user *) (AS(stacks->pcsp_lo).base + + AS(stacks->pcsp_hi).ind); + ret = user_crs_frames_copy(u_frame, regs, ®s->crs); + if (unlikely(ret)) + return ret; + } + + return 0; +} + +#endif /* _E2K_COPY_HW_STACKS_H */ + diff --git a/arch/e2k/include/asm/cpu_regs_types.h b/arch/e2k/include/asm/cpu_regs_types.h index b287fc0..13bde84 100644 --- a/arch/e2k/include/asm/cpu_regs_types.h +++ b/arch/e2k/include/asm/cpu_regs_types.h @@ -2241,6 +2241,7 @@ typedef union e2k_fpsr { #define FPSR_reg word typedef union { + u32 half_word[2]; struct { u32 user : 1; u32 system : 1; diff --git a/arch/e2k/include/asm/e2k_api.h b/arch/e2k/include/asm/e2k_api.h index 2b60ca1..db6948a 100644 --- a/arch/e2k/include/asm/e2k_api.h +++ b/arch/e2k/include/asm/e2k_api.h @@ -1038,19 +1038,26 @@ _Pragma("no_asm_inline") \ : clobbers); \ }) - #define NATIVE_EXIT_HANDLE_SYSCALL(sbr, usd_hi, usd_lo, upsr) \ ({ \ - asm volatile ("{rwd %0, %%sbr}" \ - "{rwd %1, %%usd.hi}" \ + asm volatile (ALTERNATIVE_1_ALTINSTR \ + /* CPU_HWBUG_USD_ALIGNMENT version */ \ + "{rwd %0, %%sbr;" \ + " nop}" \ + ALTERNATIVE_2_OLDINSTR \ + /* Default version */ \ + "{rwd %0, %%sbr}" \ + ALTERNATIVE_3_FEATURE(%[facility]) \ "{rwd %2, %%usd.lo}" \ + "{rwd %1, %%usd.hi}" \ "{rws %3, %%upsr;" \ " nop 4}\n" \ : \ : "ri" ((__e2k_u64_t) (sbr)), \ "ri" ((__e2k_u64_t) (usd_hi)), \ "ri" ((__e2k_u64_t) (usd_lo)), \ - "ri" ((__e2k_u32_t) (upsr))); \ + "ri" ((__e2k_u32_t) (upsr)), \ + [facility] "i" (CPU_HWBUG_USD_ALIGNMENT)); \ }) @@ -1093,6 +1100,15 @@ _Pragma("no_asm_inline") \ : "r" ((__e2k_u64_t) (val))); \ }) +#define NATIVE_SET_MMUREG_CLOSED(reg_mnemonic, val, nop) \ +({ \ + asm volatile ("{nop " #nop "\n" \ + " mmurw %0, %%" #reg_mnemonic "}" \ + : \ + : "r" ((u64) (val))); \ +}) + + #define NATIVE_TAGGED_LOAD_TO_MMUREG(reg_mnemonic, _addr) \ do { \ unsigned long long _tmp; \ diff --git a/arch/e2k/include/asm/e2k_debug.h b/arch/e2k/include/asm/e2k_debug.h index fc1bb8c..a9383d4 100644 --- a/arch/e2k/include/asm/e2k_debug.h +++ b/arch/e2k/include/asm/e2k_debug.h @@ -23,8 +23,6 @@ #include #include -#define CHK_DEBUGGER(trapnr, signr, error_code, address, regs, after) - #define IS_KERNEL_THREAD(task, mm) \ ({ \ e2k_addr_t ps_base; \ @@ -248,6 +246,8 @@ 
host_ftrace_dump(void) { return; } + +static const bool kvm_debug = false; #else /* CONFIG_VIRTUALIZATION */ /* it is native host kernel with virtualization support */ /* or it is paravirtualized host/guest kernel */ diff --git a/arch/e2k/include/asm/epic.h b/arch/e2k/include/asm/epic.h index 88650d8..372271d 100644 --- a/arch/e2k/include/asm/epic.h +++ b/arch/e2k/include/asm/epic.h @@ -11,6 +11,8 @@ #ifndef __ASSEMBLY__ +void do_sic_error_interrupt(void); + static inline bool cpu_has_epic(void) { if (cpu_has(CPU_FEAT_EPIC)) diff --git a/arch/e2k/include/asm/hw_breakpoint.h b/arch/e2k/include/asm/hw_breakpoint.h index c25bff8..ed813ba 100644 --- a/arch/e2k/include/asm/hw_breakpoint.h +++ b/arch/e2k/include/asm/hw_breakpoint.h @@ -35,18 +35,12 @@ extern int hw_breakpoint_exceptions_notify( extern void hw_breakpoint_pmu_read(struct perf_event *bp); #ifdef CONFIG_HAVE_HW_BREAKPOINT -extern int bp_data_overflow_handle(struct pt_regs *regs); -extern int bp_instr_overflow_handle(struct pt_regs *regs); +extern void bp_data_overflow_handle(struct pt_regs *regs); +extern void bp_instr_overflow_handle(struct pt_regs *regs); extern void clear_ptrace_hw_breakpoint(struct task_struct *tsk); #else /* ! CONFIG_HAVE_HW_BREAKPOINT */ -static inline int bp_data_overflow_handle(struct pt_regs *regs) -{ - return 0; -} -static inline int bp_instr_overflow_handle(struct pt_regs *regs) -{ - return 0; -} +static inline void bp_data_overflow_handle(struct pt_regs *regs) { } +static inline void bp_instr_overflow_handle(struct pt_regs *regs) { } static inline void clear_ptrace_hw_breakpoint(struct task_struct *tsk) {} #endif /* CONFIG_HAVE_HW_BREAKPOINT */ diff --git a/arch/e2k/include/asm/kprobes.h b/arch/e2k/include/asm/kprobes.h index b45d43c..99970be 100644 --- a/arch/e2k/include/asm/kprobes.h +++ b/arch/e2k/include/asm/kprobes.h @@ -61,18 +61,14 @@ static inline int is_kprobe_break1_trap(struct pt_regs *regs) return *instr == KPROBE_BREAK_1; } -extern int kprobe_instr_debug_handle(struct pt_regs *); - +extern void kprobe_instr_debug_handle(struct pt_regs *); #else static inline int is_kprobe_break1_trap(struct pt_regs *regs) { return false; } -static inline int kprobe_instr_debug_handle(struct pt_regs *regs) -{ - return 0; -} +static inline void kprobe_instr_debug_handle(struct pt_regs *regs) { } #endif /* #ifdef CONFIG_KPROBES */ #ifdef CONFIG_KRETPROBES diff --git a/arch/e2k/include/asm/kvm/copy-hw-stacks.h b/arch/e2k/include/asm/kvm/copy-hw-stacks.h new file mode 100644 index 0000000..55a35e6 --- /dev/null +++ b/arch/e2k/include/asm/kvm/copy-hw-stacks.h @@ -0,0 +1,462 @@ +/* + * KVM guest kernel processes support + * Copyright 2011 Salavat S. Guiliazov (atic@mcst.ru) + */ + +#ifndef _E2K_KVM_COPY_HW_STACKS_H +#define _E2K_KVM_COPY_HW_STACKS_H + +#include + +#include +#include +#include +#include +#include +#include + +#undef DEBUG_KVM_GUEST_STACKS_MODE +#undef DebugGUST +#define DEBUG_KVM_GUEST_STACKS_MODE 0 /* guest user stacks */ + /* copy debug */ +#define DebugGUST(fmt, args...) 
\ +({ \ + if (DEBUG_KVM_GUEST_STACKS_MODE) \ + pr_info("%s(): " fmt, __func__, ##args); \ +}) + +#ifdef CONFIG_KVM_HOST_MODE +static inline void +prepare_pv_vcpu_inject_stacks(struct kvm_vcpu *vcpu, pt_regs_t *regs) +{ + e2k_stacks_t *stacks, *g_stacks; + gthread_info_t *gti = pv_vcpu_get_gti(vcpu); + + if (regs->g_stacks_valid) { + /* already prepared */ + return; + } + + /* all stacks at empty state, because of guest user recursion */ + /* of trap/system calls can not be */ + g_stacks = ®s->g_stacks; + g_stacks->usd_lo = gti->g_usd_lo; + g_stacks->usd_hi = gti->g_usd_hi; + g_stacks->top = gti->g_sbr.SBR_base; + g_stacks->psp_lo = gti->g_psp_lo; + g_stacks->psp_hi = gti->g_psp_hi; + g_stacks->pcsp_lo = gti->g_pcsp_lo; + g_stacks->pcsp_hi = gti->g_pcsp_hi; + + /* pshtp & pcshtp from guest user stack real state upon trap/syscall */ + stacks = ®s->stacks; + g_stacks->pshtp = stacks->pshtp; + g_stacks->pcshtp = stacks->pcshtp; + + regs->g_stacks_valid = true; + regs->g_stacks_active = false; + regs->need_inject = false; +} + +#undef EMULATE_EMPTY_CHAIN_STACK /* only to debug */ + +#ifdef EMULATE_EMPTY_CHAIN_STACK +static __always_inline void +pv_vcpu_emulate_empty_chain_staks(struct kvm_vcpu *vcpu, pt_regs_t *regs, + e2k_stacks_t *stacks, bool guest_user) +{ + e2k_pcshtp_t pcshtp; + unsigned long flags; + e2k_pcsp_lo_t g_pcsp_lo, k_pcsp_lo; + e2k_pcsp_hi_t g_pcsp_hi, k_pcsp_hi; + e2k_mem_crs_t __user *g_cframe; + e2k_mem_crs_t *k_crs; + int ret; + + pcshtp = stacks->pcshtp; + if (!(guest_user && pcshtp <= 0x40)) + return; + + g_pcsp_lo = regs->stacks.pcsp_lo; + g_pcsp_hi = regs->stacks.pcsp_hi; + + raw_all_irq_save(flags); + NATIVE_FLUSHC; + k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); + k_pcsp_lo = NATIVE_NV_READ_PCSP_LO_REG(); + BUG_ON(AS(k_pcsp_hi).ind != pcshtp); + + k_crs = (e2k_mem_crs_t *) AS(k_pcsp_lo).base; + g_cframe = (e2k_mem_crs_t __user *) (AS(g_pcsp_lo).base + + AS(g_pcsp_hi).ind - pcshtp); + ret = user_hw_stack_frames_copy(g_cframe, k_crs, pcshtp, regs, + k_pcsp_hi.PCSP_hi_ind, true); + if (ret) { + pr_err("%s(): copy to user stack failed\n", __func__); + BUG_ON(true); + } + k_pcsp_hi.PCSP_hi_ind -= pcshtp; + pcshtp = 0; + regs->stacks.pcshtp = pcshtp; + stacks->pcshtp = pcshtp; + NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); + raw_all_irq_restore(flags); +} +#else /* !EMULATE_EMPTY_CHAIN_STACK */ +static __always_inline void +pv_vcpu_emulate_empty_chain_staks(struct kvm_vcpu *vcpu, pt_regs_t *regs, + e2k_stacks_t *stacks, bool guest_user) +{ +} +#endif /* EMULATE_EMPTY_CHAIN_STACK */ + +/** + * pv_vcpu_user_hw_stacks_copy - check size of user hardware stacks that have + * been SPILLed to kernel back to guest space + * @regs - saved guest user stack registers + * @cur_window_q - size of current window in procedure stack + * + * All guest user's stacks part were already copied to guest kernel stacks, + * so it need only check that it was full size and nothing to copy here + */ +static __always_inline int +pv_vcpu_user_hw_stacks_copy(pt_regs_t *regs, e2k_stacks_t *stacks, + u64 cur_window_q, bool guest_user) +{ + e2k_psp_lo_t g_psp_lo = stacks->psp_lo, + k_psp_lo = current_thread_info()->k_psp_lo; + e2k_psp_hi_t g_psp_hi = stacks->psp_hi; + e2k_pcsp_lo_t g_pcsp_lo = stacks->pcsp_lo, + k_pcsp_lo = current_thread_info()->k_pcsp_lo; + e2k_pcsp_hi_t g_pcsp_hi = stacks->pcsp_hi; + s64 g_pshtp_size, g_pcshtp_size, ps_copy_size, pcs_copy_size; + int ret; + + DebugUST("guest kernel chain state: base 0x%llx ind 0x%x size 0x%x\n", + g_pcsp_lo.PCSP_lo_base, g_pcsp_hi.PCSP_hi_ind, + 
g_pcsp_hi.PCSP_hi_size);
+    DebugUST("guest kernel proc state: base 0x%llx ind 0x%x size 0x%x\n",
+        g_psp_lo.PSP_lo_base, g_psp_hi.PSP_hi_ind,
+        g_psp_hi.PSP_hi_size);
+    g_pshtp_size = GET_PSHTP_MEM_INDEX(stacks->pshtp);
+    g_pcshtp_size = PCSHTP_SIGN_EXTEND(stacks->pcshtp);
+    DebugUST("guest kernel chain stack PCSHTP 0x%llx, "
+        "proc stack PSHTP 0x%llx cur window 0x%llx\n",
+        g_pcshtp_size, g_pshtp_size, cur_window_q);
+
+    /*
+     * FIXME: the current implementation of the guest user signal handler
+     * injection uses direct copying to guest hardware stacks.
+     * It is a bad decision and needs to be corrected
+    KVM_BUG_ON(is_paging(current_thread_info()->vcpu) &&
+        (g_psp_lo.PSP_lo_base < GUEST_TASK_SIZE ||
+         g_pcsp_lo.PCSP_lo_base < GUEST_TASK_SIZE));
+     */
+
+    /*
+     * Calculate size of user's part to copy from kernel stacks
+     * into guest kernel stacks
+     */
+    pcs_copy_size = get_pcs_copy_size(g_pcshtp_size);
+    ps_copy_size = get_ps_copy_size(cur_window_q, g_pshtp_size);
+    /* Make sure there is enough space in CF for the FILL */
+    BUG_ON((E2K_MAXCR_q - 4) * 16 < E2K_CF_MAX_FILL);
+    DebugUST("to copy chain stack 0x%llx, proc stack 0x%llx\n",
+        pcs_copy_size, ps_copy_size);
+
+    if (likely(pcs_copy_size <= 0 && ps_copy_size <= 0))
+        return 0;
+
+    if (unlikely(pcs_copy_size > 0)) {
+        e2k_pcsp_hi_t k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG();
+        void __user *dst;
+        void *src;
+
+        /* Since SPILL'ed guest user data will be copied to guest
+         * kernel stacks, there cannot be any overflow of the user's
+         * hardware stack. */
+        if (unlikely(AS(g_pcsp_hi).ind > AS(g_pcsp_hi).size)) {
+            pr_err("%s(): guest kernel chain stack overflow "
+                "(out of memory?): ind 0x%x size 0x%x\n",
+                __func__, g_pcsp_hi.PCSP_hi_ind,
+                g_pcsp_hi.PCSP_hi_size);
+            KVM_BUG_ON(true);
+        }
+        dst = (void __user *)(g_pcsp_lo.PCSP_lo_base +
+                    g_pcsp_hi.PCSP_hi_ind);
+        if (!guest_user) {
+            /* stack index has been incremented on PCSHTP */
+            dst -= g_pcshtp_size;
+        }
+        src = (void *)k_pcsp_lo.PCSP_lo_base;
+
+        if (trace_host_copy_hw_stack_enabled())
+            trace_host_copy_hw_stack(dst, src, pcs_copy_size, true);
+
+        ret = user_hw_stack_frames_copy(dst, src, pcs_copy_size, regs,
+                        k_pcsp_hi.PCSP_hi_ind, true);
+        if (trace_host_chain_stack_frame_enabled())
+            trace_chain_stack_frames((e2k_mem_crs_t *)dst,
+                (e2k_mem_crs_t *)src, pcs_copy_size,
+                trace_host_chain_stack_frame);
+        if (ret)
+            return ret;
+        if (guest_user) {
+            g_pcsp_hi.PCSP_hi_ind += pcs_copy_size;
+            stacks->pcsp_hi = g_pcsp_hi;
+            DebugGUST("guest user chain stack frames copied from "
+                "host %px to guest kernel from %px size 0x%llx "
+                "PCSP.ind 0x%x\n",
+                src, dst, pcs_copy_size, g_pcsp_hi.PCSP_hi_ind);
+        }
+    }
+
+    if (unlikely(ps_copy_size > 0)) {
+        e2k_psp_hi_t k_psp_hi = NATIVE_NV_READ_PSP_HI_REG();
+        void __user *dst;
+        void *src;
+
+        /* Since SPILL'ed guest user data will be copied to guest
+         * kernel stacks, there cannot be any overflow of the user's
+         * hardware stack. */
+        if (unlikely(AS(g_psp_hi).ind > AS(g_psp_hi).size)) {
+            pr_err("%s(): guest kernel proc stack overflow "
+                "(out of memory?): ind 0x%x size 0x%x\n",
+                __func__, g_psp_hi.PSP_hi_ind,
+                g_psp_hi.PSP_hi_size);
+            KVM_BUG_ON(true);
+        }
+        dst = (void __user *)(g_psp_lo.PSP_lo_base +
+                    g_psp_hi.PSP_hi_ind);
+        if (!guest_user) {
+            /* stack index has been incremented on PSHTP */
+            dst -= g_pshtp_size;
+        }
+        src = (void *)k_psp_lo.PSP_lo_base;
+
+        if (trace_host_copy_hw_stack_enabled())
+            trace_host_copy_hw_stack(dst, src, ps_copy_size, false);
+
+        ret = user_hw_stack_frames_copy(dst, src, ps_copy_size, regs,
+                        k_psp_hi.PSP_hi_ind, false);
+        if (trace_host_proc_stack_frame_enabled())
+            trace_proc_stack_frames((kernel_mem_ps_t *)dst,
+                (kernel_mem_ps_t *)src, ps_copy_size,
+                trace_host_proc_stack_frame);
+        if (ret)
+            return ret;
+        if (guest_user) {
+            g_psp_hi.PSP_hi_ind += ps_copy_size;
+            stacks->psp_hi = g_psp_hi;
+            DebugGUST("guest user proc stack frames copied from "
+                "host %px to guest kernel from %px size 0x%llx "
+                "PSP.ind 0x%x\n",
+                src, dst, ps_copy_size, g_psp_hi.PSP_hi_ind);
+        }
+    }
+
+    return 0;
+}
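/*
 * Editor's sketch, not part of the patch: a minimal, self-contained model of
 * the "leftover" computation behind get_ps_copy_size()/get_pcs_copy_size()
 * used above. Hardware FILL can only restore what fits into the free part of
 * the register file, so any spilled bytes beyond that must be copied back to
 * user space by hand. MAXSR_Q and QREG_BYTES are illustrative placeholders,
 * not the real E2K_MAXSR/EXT_4_NR_SZ values.
 */
enum { MAXSR_Q = 112, QREG_BYTES = 32 };

static long ps_manual_copy_bytes(long spilled_bytes, long cur_window_q)
{
    /* bytes the hardware can FILL into the free part of the register file */
    long fillable = (MAXSR_Q - cur_window_q) * QREG_BYTES;

    /* > 0 means this many bytes cannot be FILLed and need a manual copy;
     * <= 0 means FILL alone covers everything that was spilled */
    return spilled_bytes - fillable;
}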
+/**
+ * pv_vcpu_user_hw_stacks_prepare - prepare guest user hardware stacks
+ *	that have been SPILLed to kernel back to guest user space
+ * @regs - saved guest user stack registers
+ * @cur_window_q - size of current window in procedure stack
+ * @syscall - true if called upon direct system call exit (no signal handlers)
+ *
+ * This does two things:
+ *
+ * 1) It is possible that upon kernel entry pcshtp == 0 in some cases:
+ *	- user signal handler had pcshtp==0x20 before return to sigreturn()
+ *	- user context had pcshtp==0x20 before return to makecontext_trampoline()
+ *	- chain stack underflow happened
+ * So it is possible in sigreturn() and traps, but not in system calls.
+ * If we are using the trick with return to FILL user hardware stacks then
+ * we must have a frame in chain stack to return to. So in this case kernel's
+ * chain stack is moved up by one frame (0x20 bytes).
+ * We also fill the new frame with actual user data and update stacks->pcshtp,
+ * this is needed to keep the coherent state where saved stacks->pcshtp values
+ * show how much data from user space has been spilled to kernel space.
+ *
+ * 2) It is not possible to always FILL all of user data that have been
+ * SPILLed to kernel stacks. So we manually copy the leftovers that can
+ * not be FILLed to user space.
+ * This copy does not update stacks->pshtp and stacks->pcshtp. Main reason
+ * is signals: if a signal arrives after copying then it must see a coherent
+ * state where saved stacks->pshtp and stacks->pcshtp values show how much
+ * data from user space has been spilled to kernel space.
+ */
+static __always_inline void
+pv_vcpu_user_hw_stacks_prepare(struct kvm_vcpu *vcpu, pt_regs_t *regs,
+            u64 cur_window_q, enum restore_caller from, int syscall)
+{
+    e2k_stacks_t *stacks;
+    e2k_pcshtp_t pcshtp;
+    bool guest_user;
+    bool paging = is_paging(vcpu);
+    int ret;
+
+    if (likely(paging)) {
+        guest_user = !!(syscall || !pv_vcpu_trap_on_guest_kernel(regs));
+    } else {
+        guest_user = false;
+    }
+    if (guest_user) {
+        if (from & FROM_PV_VCPU_MODE) {
+            /* all preparation has been made */
+            /* by host & guest handler */
+            return;
+        }
+
+        /* trap on/syscall from guest user, so regs keeps the user */
+        /* registers state, and the guest kernel stacks need to be */
+        /* used in empty state to handle this trap/syscall */
+        if (!regs->g_stacks_valid) {
+            prepare_pv_vcpu_inject_stacks(vcpu, regs);
+        }
+        stacks = &regs->g_stacks;
+    } else {
+        /* trap on guest kernel, so regs already points to guest */
+        /* kernel stacks and trap will be handled by host */
+        /* same as other user's processes traps */
+        stacks = &regs->stacks;
+    }
+
+    /* only to debug on simulator : pcshtp == 0 */
+    pv_vcpu_emulate_empty_chain_staks(vcpu, regs, stacks, guest_user);
+
+    pcshtp = stacks->pcshtp;
+    DebugUST("guest kernel chain stack state: base 0x%llx ind 0x%x "
+        "size 0x%x\n",
+        stacks->pcsp_lo.PCSP_lo_base,
+        stacks->pcsp_hi.PCSP_hi_ind,
+        stacks->pcsp_hi.PCSP_hi_size);
+    DebugUST("host kernel chain stack state: base 0x%llx ind 0x%x "
+        "size 0x%x\n",
+        NATIVE_NV_READ_PCSP_LO_REG().PCSP_lo_base,
+        NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_ind,
+        NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_size);
+    DebugUST("guest kernel chain stack size to fill PCSHTP 0x%x\n",
+        pcshtp);
+    /*
+     * 1) Make sure there is free space in kernel chain stack to return to
+     */
+    if (!syscall && pcshtp == 0 && !guest_user) {
+        unsigned long flags;
+        e2k_pcsp_lo_t g_pcsp_lo = stacks->pcsp_lo,
+                k_pcsp_lo = current_thread_info()->k_pcsp_lo;
+        e2k_pcsp_hi_t g_pcsp_hi = stacks->pcsp_hi, k_pcsp_hi;
+        e2k_mem_crs_t __user *g_cframe;
+        e2k_mem_crs_t *k_crs;
+        int ret = -EINVAL;
+
+        raw_all_irq_save(flags);
+        NATIVE_FLUSHC;
+        k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG();
+        BUG_ON(AS(k_pcsp_hi).ind);
+        AS(k_pcsp_hi).ind += SZ_OF_CR;
+        NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi);
+
+        k_crs = (e2k_mem_crs_t *) AS(k_pcsp_lo).base;
+        g_cframe = (e2k_mem_crs_t __user *) (AS(g_pcsp_lo).base +
+                AS(g_pcsp_hi).ind);
+        if ((u64) g_cframe > (u64) AS(g_pcsp_lo).base) {
+            ret = __copy_user_to_current_hw_stack(k_crs,
+                g_cframe - 1, sizeof(*k_crs), regs, true);
+        }
+        raw_all_irq_restore(flags);
+
+        /* Can happen if application returns until runs out of
+         * chain stack or there is no free memory for stacks.
+         * There is no user stack to return to - die. */
+        if (ret) {
+            E2K_LMS_HALT_OK;
+            pr_err("%s(): SIGKILL. %s\n",
+                __func__,
+                (ret == -EINVAL) ?
+ "tried to return to kernel" + : + "ran into Out-of-Memory on user stacks"); + force_sig(SIGKILL); + return; + } + DebugUST("copy guest user chain frame from %px to kernel " + "bottom from %px\n", + g_cframe - 1, k_crs); + + if (AS(g_pcsp_hi).ind < SZ_OF_CR) { + pr_err("%s(): guest kernel chain stack underflow\n", + __func__); + KVM_BUG_ON(true); + } + + pcshtp = SZ_OF_CR; + stacks->pcshtp = pcshtp; + DebugUST("guest kernel chain stack to FILL PCSHTP " + "set to 0x%x\n", + stacks->pcshtp); + } else if (!syscall && pcshtp == 0 && guest_user) { + e2k_pcsp_hi_t k_pcsp_hi; + unsigned long flags; + + /* set flag for unconditional injection to do not copy */ + /* from guest user space */ + regs->need_inject = true; + + /* reserve one bottom frames for trampoline */ + /* the guest handler replaces guest user trapped frame */ + raw_all_irq_save(flags); + NATIVE_FLUSHC; + k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); + BUG_ON(k_pcsp_hi.PCSP_hi_ind); + k_pcsp_hi.PCSP_hi_ind += 1 * SZ_OF_CR; + NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); + raw_all_irq_restore(flags); + } + + /* + * 2) Copy user data that cannot be FILLed + */ + ret = pv_vcpu_user_hw_stacks_copy(regs, stacks, cur_window_q, + guest_user); + if (unlikely(ret)) + do_exit(SIGKILL); +} + +/* Same as for native kernel without virtualization support */ +static __always_inline int +user_hw_stacks_copy(struct e2k_stacks *stacks, + pt_regs_t *regs, u64 cur_window_q, bool copy_full) +{ + return native_user_hw_stacks_copy(stacks, regs, cur_window_q, copy_full); +} + +static __always_inline void +host_user_hw_stacks_prepare(struct e2k_stacks *stacks, pt_regs_t *regs, + u64 cur_window_q, enum restore_caller from, int syscall) +{ + struct kvm_vcpu *vcpu; + + if (likely(!kvm_test_intc_emul_flag(regs))) { + /* trap on/syscall from host user processes */ + return native_user_hw_stacks_prepare(stacks, regs, + cur_window_q, from, syscall); + } + + vcpu = current_thread_info()->vcpu; + KVM_BUG_ON(vcpu == NULL); + pv_vcpu_user_hw_stacks_prepare(vcpu, regs, cur_window_q, from, syscall); +} +#endif /* CONFIG_KVM_HOST_MODE */ + +#ifdef CONFIG_KVM_GUEST_KERNEL +/* it is native guest kernel */ +#include +#else /* CONFIG_VIRTUALIZATION && ! CONFIG_KVM_GUEST_KERNEL */ +/* it is native host kernel with virtualization support */ +/* or it is paravirtualized host and guest kernel */ +#endif /* ! CONFIG_KVM_GUEST_KERNEL */ + +#endif /* ! 
_E2K_KVM_COPY_HW_STACKS_H */ diff --git a/arch/e2k/include/asm/kvm/debug.h b/arch/e2k/include/asm/kvm/debug.h index d6cce8b..9c99d45 100644 --- a/arch/e2k/include/asm/kvm/debug.h +++ b/arch/e2k/include/asm/kvm/debug.h @@ -11,6 +11,8 @@ #include #include +extern bool kvm_debug; + /* * Some definitions to print/dump/show stacks */ diff --git a/arch/e2k/include/asm/kvm/gmmu_context.h b/arch/e2k/include/asm/kvm/gmmu_context.h index 15b4e2f..eaa0114 100644 --- a/arch/e2k/include/asm/kvm/gmmu_context.h +++ b/arch/e2k/include/asm/kvm/gmmu_context.h @@ -86,6 +86,7 @@ kvm_mmu_set_init_gmm_root(struct kvm_vcpu *vcpu, hpa_t root) } gmm->u_pptb = vcpu->arch.mmu.get_vcpu_u_pptb(vcpu); gmm->os_pptb = vcpu->arch.mmu.get_vcpu_os_pptb(vcpu); + gmm->u_vptb = vcpu->arch.mmu.get_vcpu_u_vptb(vcpu); } static inline pgd_t * kvm_mmu_get_gmm_root(struct gmm_struct *gmm) @@ -208,15 +209,12 @@ switch_guest_pgd(pgd_t *next_pgd) pgd_to_set = next_pgd; } + KVM_BUG_ON(PCSHTP_SIGN_EXTEND(NATIVE_READ_PCSHTP_REG_SVALUE()) != 0); + reload_root_pgd(pgd_to_set); /* FIXME: support of guest secondary space is not yet implemented reload_secondary_page_dir(mm); */ - - /* any function call can fill old state of hardware stacks */ - /* so after all calls do flush stacks again */ - NATIVE_FLUSHCPU; - E2K_WAIT(_all_e); } #define DO_NOT_USE_ACTIVE_GMM /* turn OFF optimization */ diff --git a/arch/e2k/include/asm/kvm/guest/copy-hw-stacks.h b/arch/e2k/include/asm/kvm/guest/copy-hw-stacks.h new file mode 100644 index 0000000..e616bcf --- /dev/null +++ b/arch/e2k/include/asm/kvm/guest/copy-hw-stacks.h @@ -0,0 +1,568 @@ +/* + * KVM guest kernel processes support + * Copyright 2011 Salavat S. Guiliazov (atic@mcst.ru) + */ + +#ifndef _E2K_KVM_GUEST_COPY_HW_STACKS_H +#define _E2K_KVM_GUEST_COPY_HW_STACKS_H + +#include +#include +#include + +#include + +extern bool debug_ustacks; +#undef DEBUG_USER_STACKS_MODE +#undef DebugUST +#define DEBUG_USER_STACKS_MODE 0 /* guest user stacks debug mode */ +#define DebugUST(fmt, args...) 
\ +({ \ + if (debug_ustacks) \ + pr_info("%s(): " fmt, __func__, ##args); \ +}) + +static inline void +kvm_kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) +{ + fast_tagged_memory_copy(dst, src, size, + TAGGED_MEM_STORE_REC_OPC | + MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, + TAGGED_MEM_LOAD_REC_OPC | + MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, true); +} + +static __always_inline void +kvm_collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) +{ + e2k_psp_hi_t k_psp_hi; + u64 ps_ind, ps_size; + u64 size; + + DebugUST("current host procedure stack index 0x%x, PSHTP 0x%x\n", + NATIVE_NV_READ_PSP_HI_REG().PSP_hi_ind, + NATIVE_NV_READ_PSHTP_REG().PSHTP_ind); + + KVM_COPY_STACKS_TO_MEMORY(); + ATOMIC_GET_HW_PS_SIZES(ps_ind, ps_size); + + size = ps_ind - spilled_size; + BUG_ON(!IS_ALIGNED(size, ALIGN_PSTACK_TOP_SIZE) || (s64) size < 0); + + kvm_kernel_hw_stack_frames_copy(dst, src, size); + + k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); + k_psp_hi.PSP_hi_ind = size; + HYPERVISOR_update_psp_hi(k_psp_hi.PSP_hi_half); + + DebugUST("move spilled procedure part from host top %px to " + "bottom %px, size 0x%llx\n", + src, dst, size); + DebugUST("host kernel procedure stack index is now 0x%x, " + "guest user PSHTP 0x%llx\n", + k_psp_hi.PSP_hi_ind, spilled_size); +} + +static __always_inline void +kvm_collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) +{ + e2k_pcsp_hi_t k_pcsp_hi; + u64 pcs_ind, pcs_size; + u64 size; + + DebugUST("current host chain stack index 0x%x, PCSHTP 0x%llx\n", + NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_ind, + NATIVE_READ_PCSHTP_REG_SVALUE()); + + KVM_COPY_STACKS_TO_MEMORY(); + ATOMIC_GET_HW_PCS_SIZES(pcs_ind, pcs_size); + + size = pcs_ind - spilled_size; + BUG_ON(!IS_ALIGNED(size, ALIGN_PCSTACK_TOP_SIZE) || (s64) size < 0); + + kvm_kernel_hw_stack_frames_copy(dst, src, size); + + k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); + k_pcsp_hi.PCSP_hi_ind = size; + HYPERVISOR_update_pcsp_hi(k_pcsp_hi.PCSP_hi_half); + + DebugUST("move spilled chain part from host top %px to " + "bottom %px, size 0x%llx\n", + src, dst, size); + DebugUST("host kernel chain stack index is now 0x%x, " + "guest user PCSHTP 0x%llx\n", + k_pcsp_hi.PCSP_hi_ind, spilled_size); +} + +static __always_inline int +copy_stack_page_from_kernel(void __user *dst, void *src, e2k_size_t to_copy, + bool is_chain) +{ + int ret; + + ret = HYPERVISOR_copy_hw_stacks_frames(dst, src, to_copy, is_chain); + return ret; +} + +static __always_inline int +copy_stack_page_to_user(void __user *dst, void *src, e2k_size_t to_copy, + bool is_chain) +{ + struct page *page = NULL; + unsigned long addr = (unsigned long)dst; + void *k_dst; + e2k_size_t offset; + mm_segment_t seg; + unsigned long ts_flag; + int npages; + int ret; + + if (to_copy == 0) + return 0; + + DebugUST("started to copy %s stack from kernel stack %px to user %px " + "size 0x%lx\n", + (is_chain) ? 
"chain" : "procedure", + src, dst, to_copy); + seg = get_fs(); + set_fs(K_USER_DS); + ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); + do { + npages = __get_user_pages_fast(addr, 1, 1, &page); + if (npages == 1) + break; + npages = get_user_pages_unlocked(addr, 1, &page, FOLL_WRITE); + if (npages == 1) + break; + clear_ts_flag(ts_flag); + set_fs(seg); + ret = -EFAULT; + goto failed; + } while (npages != 1); + clear_ts_flag(ts_flag); + set_fs(seg); + + offset = addr & ~PAGE_MASK; + k_dst = page_address(page) + offset; + DebugUST("copy stack frames from kernel %px to user %px, size 0x%lx\n", + src, k_dst, to_copy); + ret = copy_stack_page_from_kernel(k_dst, src, to_copy, is_chain); + if (ret != 0) { + pr_err("%s(): copy %s stack to user %px from kernel %px, " + "size 0x%lx failed, error %d\n", + __func__, (is_chain) ? "chain" : "procedure", + src, k_dst, to_copy, ret); + goto failed_copy; + } + +failed_copy: + put_page(page); +failed: + return ret; +} + +static __always_inline int +kvm_copy_user_stack_from_kernel(void __user *dst, void *src, + e2k_size_t to_copy, bool is_chain) +{ + e2k_size_t offset, len, copied = 0; + int ret; + + if (to_copy == 0) + return 0; + + DebugUST("started to copy %s stack from kernel stack %px to user %px " + "size 0x%lx\n", + (is_chain) ? "chain" : "procedure", + src, dst, to_copy); + + if (trace_guest_copy_hw_stack_enabled()) + trace_guest_copy_hw_stack(dst, src, to_copy, is_chain); + + do { + offset = (unsigned long)dst & ~PAGE_MASK; + len = min(to_copy, PAGE_SIZE - offset); + ret = copy_stack_page_to_user(dst, src, len, is_chain); + if (ret != 0) + goto failed; + dst += len; + src += len; + to_copy -= len; + copied += len; + } while (to_copy > 0); + + if (!is_chain && trace_guest_proc_stack_frame_enabled()) { + if (trace_guest_va_tlb_state_enabled()) { + trace_guest_va_tlb_state((e2k_addr_t)dst); + } + trace_proc_stack_frames((kernel_mem_ps_t *)(src - copied), + (kernel_mem_ps_t *)(src - copied), copied, + trace_guest_proc_stack_frame); + trace_proc_stack_frames((kernel_mem_ps_t *)(dst - copied), + (kernel_mem_ps_t *)(dst - copied), copied, + trace_guest_proc_stack_frame); + } + if (is_chain && trace_guest_chain_stack_frame_enabled()) { + if (trace_guest_va_tlb_state_enabled()) { + trace_guest_va_tlb_state((e2k_addr_t)dst); + } + trace_chain_stack_frames((e2k_mem_crs_t *)(src - copied), + (e2k_mem_crs_t *)(src - copied), copied, + trace_guest_chain_stack_frame); + trace_chain_stack_frames((e2k_mem_crs_t *)(dst - copied), + (e2k_mem_crs_t *)(dst - copied), copied, + trace_guest_chain_stack_frame); + } + + return 0; + +failed: + pr_err("%s(): failed, error %d\n", __func__, ret); + return ret; +} + +static __always_inline int +kvm_user_hw_stacks_copy(pt_regs_t *regs) +{ + e2k_psp_lo_t psp_lo; + e2k_psp_hi_t psp_hi; + e2k_pshtp_t pshtp; + e2k_pcsp_lo_t pcsp_lo; + e2k_pcsp_hi_t pcsp_hi; + e2k_pcshtp_t pcshtp; + e2k_stacks_t *stacks; + void __user *dst; + void *src; + long copyed_ps_size, copyed_pcs_size, to_copy, from, there_are; + int ret; + + if (unlikely(irqs_disabled())) { + pr_err("%s() called with IRQs disabled PSP: 0x%lx UPSR: 0x%lx " + "under UPSR %d\n", + __func__, KVM_READ_PSR_REG_VALUE(), + KVM_READ_UPSR_REG_VALUE(), + kvm_get_vcpu_state()->irqs_under_upsr); + local_irq_enable(); + WARN_ON(true); + } + + stacks = ®s->stacks; + copyed_ps_size = regs->copyed.ps_size; + copyed_pcs_size = regs->copyed.pcs_size; + if (unlikely(copyed_ps_size || copyed_pcs_size)) { + /* stacks have been already copyed */ + BUG_ON(copyed_ps_size != 
+static __always_inline int
+kvm_user_hw_stacks_copy(pt_regs_t *regs)
+{
+    e2k_psp_lo_t psp_lo;
+    e2k_psp_hi_t psp_hi;
+    e2k_pshtp_t pshtp;
+    e2k_pcsp_lo_t pcsp_lo;
+    e2k_pcsp_hi_t pcsp_hi;
+    e2k_pcshtp_t pcshtp;
+    e2k_stacks_t *stacks;
+    void __user *dst;
+    void *src;
+    long copyed_ps_size, copyed_pcs_size, to_copy, from, there_are;
+    int ret;
+
+    if (unlikely(irqs_disabled())) {
+        pr_err("%s() called with IRQs disabled PSP: 0x%lx UPSR: 0x%lx "
+            "under UPSR %d\n",
+            __func__, KVM_READ_PSR_REG_VALUE(),
+            KVM_READ_UPSR_REG_VALUE(),
+            kvm_get_vcpu_state()->irqs_under_upsr);
+        local_irq_enable();
+        WARN_ON(true);
+    }
+
+    stacks = &regs->stacks;
+    copyed_ps_size = regs->copyed.ps_size;
+    copyed_pcs_size = regs->copyed.pcs_size;
+    if (unlikely(copyed_ps_size || copyed_pcs_size)) {
+        /* stacks have been already copied */
+        BUG_ON(copyed_ps_size != GET_PSHTP_MEM_INDEX(stacks->pshtp) &&
+            GET_PSHTP_MEM_INDEX(stacks->pshtp) != 0);
+        BUG_ON(copyed_pcs_size != PCSHTP_SIGN_EXTEND(stacks->pcshtp) &&
+            PCSHTP_SIGN_EXTEND(stacks->pcshtp) != SZ_OF_CR);
+        return 0;
+    }
+
+    ret = HYPERVISOR_copy_stacks_to_memory();
+    if (ret != 0) {
+        pr_err("%s(): flush of kernel stacks failed, error %d\n",
+            __func__, ret);
+        goto failed;
+    }
+
+    /* copy user part of procedure stack from kernel back to user */
+    ATOMIC_READ_HW_STACKS_REGS(psp_lo.PSP_lo_half, psp_hi.PSP_hi_half,
+                pshtp.PSHTP_reg,
+                pcsp_lo.PCSP_lo_half, pcsp_hi.PCSP_hi_half,
+                pcshtp);
+    src = (void *)psp_lo.PSP_lo_base;
+    DebugUST("procedure stack at kernel from %px, size 0x%x, ind 0x%x, "
+        "pshtp 0x%llx\n",
+        src, psp_hi.PSP_hi_size, psp_hi.PSP_hi_ind, pshtp.PSHTP_reg);
+    BUG_ON(psp_hi.PSP_hi_ind > psp_hi.PSP_hi_size);
+
+    if (stacks->psp_hi.PSP_hi_ind >= stacks->psp_hi.PSP_hi_size) {
+        /* procedure stack overflow, need expand */
+        ret = handle_proc_stack_bounds(stacks, regs->trap);
+        if (unlikely(ret)) {
+            pr_err("%s(): could not handle process %s (%d) "
+                "procedure stack overflow, error %d\n",
+                __func__, current->comm, current->pid, ret);
+            goto failed;
+        }
+    }
+    to_copy = GET_PSHTP_MEM_INDEX(stacks->pshtp);
+    BUG_ON(to_copy < 0);
+    from = stacks->psp_hi.PSP_hi_ind - to_copy;
+    BUG_ON(from < 0);
+    dst = (void __user *)stacks->psp_lo.PSP_lo_base + from;
+    DebugUST("procedure stack at user from %px, ind 0x%x, "
+        "pshtp size to copy 0x%lx\n",
+        dst, stacks->psp_hi.PSP_hi_ind, to_copy);
+    there_are = stacks->psp_hi.PSP_hi_size - from;
+    if (there_are < to_copy) {
+        pr_err("%s(): user procedure stack overflow, there are 0x%lx "
+            "to copy need 0x%lx, not yet implemented\n",
+            __func__, there_are, to_copy);
+        BUG_ON(true);
+    }
+    if (to_copy > 0) {
+        ret = kvm_copy_user_stack_from_kernel(dst, src, to_copy, false);
+        if (ret != 0) {
+            pr_err("%s(): procedure stack copying from kernel %px "
+                "to user %px, size 0x%lx failed, error %d\n",
+                __func__, src, dst, to_copy, ret);
+            goto failed;
+        }
+        regs->copyed.ps_size = to_copy;
+    }
+
+    /* copy user part of chain stack from kernel back to user */
+    src = (void *)pcsp_lo.PCSP_lo_base;
+    DebugUST("chain stack at kernel from %px, size 0x%x, ind 0x%x, "
+        "pcshtp 0x%x\n",
+        src, pcsp_hi.PCSP_hi_size, pcsp_hi.PCSP_hi_ind, pcshtp);
+    BUG_ON(pcsp_hi.PCSP_hi_ind + PCSHTP_SIGN_EXTEND(pcshtp) >
+        pcsp_hi.PCSP_hi_size);
+    if (stacks->pcsp_hi.PCSP_hi_ind >= stacks->pcsp_hi.PCSP_hi_size) {
+        /* chain stack overflow, need expand */
+        ret = handle_chain_stack_bounds(stacks, regs->trap);
+        if (unlikely(ret)) {
+            pr_err("%s(): could not handle process %s (%d) "
+                "chain stack overflow, error %d\n",
+                __func__, current->comm, current->pid, ret);
+            goto failed;
+        }
+    }
+    to_copy = PCSHTP_SIGN_EXTEND(stacks->pcshtp);
+    BUG_ON(to_copy < 0);
+    from = stacks->pcsp_hi.PCSP_hi_ind - to_copy;
+    BUG_ON(from < 0);
+    dst = (void *)stacks->pcsp_lo.PCSP_lo_base + from;
+    BUG_ON(to_copy > pcsp_hi.PCSP_hi_ind + PCSHTP_SIGN_EXTEND(pcshtp));
+    DebugUST("chain stack at user from %px, ind 0x%x, "
+        "pcshtp size to copy 0x%lx\n",
+        dst, stacks->pcsp_hi.PCSP_hi_ind, to_copy);
+    there_are = stacks->pcsp_hi.PCSP_hi_size - from;
+    if (there_are < to_copy) {
+        pr_err("%s(): user chain stack overflow, there are 0x%lx "
+            "to copy need 0x%lx, not yet implemented\n",
+            __func__, there_are, to_copy);
+        BUG_ON(true);
+    }
+    if (to_copy > 0) {
+        ret = kvm_copy_user_stack_from_kernel(dst, src, to_copy, true);
+        if (ret != 0) {
+            pr_err("%s(): chain stack copying from kernel %px "
+                "to user %px, size 0x%lx failed, error %d\n",
size 0x%lx failed, error %d\n", + __func__, src, dst, to_copy, ret); + goto failed; + } + regs->copyed.pcs_size = to_copy; + } + +failed: + if (DEBUG_USER_STACKS_MODE) + debug_ustacks = false; + return ret; +} + +/* + * Copy additional frames injected to the guest kernel stack; these frames + * belong to the guest user stack and should be copied from the kernel back + * to the top of the user stack. + */ +static __always_inline int +kvm_copy_injected_pcs_frames_to_user(pt_regs_t *regs, int frames_num) +{ + e2k_size_t pcs_ind, pcs_size; + e2k_addr_t pcs_base; + int pcsh_top; + e2k_stacks_t *stacks; + void __user *dst; + void *src; + long copyed_frames_size, to_copy, from, there_are, frames_size; + int ret; + + BUG_ON(irqs_disabled()); + + frames_size = frames_num * SZ_OF_CR; + copyed_frames_size = regs->copyed.pcs_injected_frames_size; + if (unlikely(copyed_frames_size >= frames_size)) { + /* all frames have already been copied */ + return 0; + } else { + /* only part of the frames was copied - not implemented case */ + BUG_ON(copyed_frames_size != 0); + } + + stacks = &regs->stacks; + ATOMIC_GET_HW_PCS_SIZES_BASE_TOP(pcs_ind, pcs_size, pcs_base, pcsh_top); + + /* the guest user stack part spilled to kernel should already be copied */ + BUG_ON(regs->copyed.pcs_size != PCSHTP_SIGN_EXTEND(stacks->pcshtp)); + + src = (void *)(pcs_base + regs->copyed.pcs_size); + DebugUST("chain stack at kernel from %px, size 0x%lx + 0x%lx, " + "ind 0x%lx, pcsh top 0x%x\n", + src, pcs_size, frames_size, pcs_ind, pcsh_top); + BUG_ON(regs->copyed.pcs_size + frames_size > pcs_ind + pcsh_top); + if (stacks->pcsp_hi.PCSP_hi_ind + frames_size > + stacks->pcsp_hi.PCSP_hi_size) { + /* user chain stack can overflow, need to expand */ + ret = handle_chain_stack_bounds(stacks, regs->trap); + if (unlikely(ret)) { + pr_err("%s(): could not handle process %s (%d) " + "chain stack overflow, error %d\n", + __func__, current->comm, current->pid, ret); + goto failed; + } + } + to_copy = frames_size; + BUG_ON(to_copy < 0); + from = stacks->pcsp_hi.PCSP_hi_ind; + BUG_ON(from < regs->copyed.pcs_size); + dst = (void __user *)stacks->pcsp_lo.PCSP_lo_base + from; + DebugUST("chain stack at user from %px, ind 0x%x, " + "frames size to copy 0x%lx\n", + dst, stacks->pcsp_hi.PCSP_hi_ind, to_copy); + there_are = stacks->pcsp_hi.PCSP_hi_size - from; + if (there_are < to_copy) { + pr_err("%s(): user chain stack overflow, only 0x%lx available " + "but need to copy 0x%lx, not yet implemented\n", + __func__, there_are, to_copy); + BUG_ON(true); + } + if (likely(to_copy > 0)) { + ret = kvm_copy_user_stack_from_kernel(dst, src, to_copy, true); + if (ret != 0) { + pr_err("%s(): chain stack copying from kernel %px " + "to user %px, size 0x%lx failed, error %d\n", + __func__, src, dst, to_copy, ret); + goto failed; + } + regs->copyed.pcs_injected_frames_size = to_copy; + /* increment chain stack pointer */ + stacks->pcsp_hi.PCSP_hi_ind += to_copy; + } else { + BUG_ON(true); + ret = 0; + } + +failed: + if (DEBUG_USER_STACKS_MODE) + debug_ustacks = false; + return ret; +} + +/** + * user_hw_stacks_prepare - prepare user hardware stacks that have been + * SPILLed to kernel back to user space + * @stacks - saved user stack registers + * @cur_window_q - size of current window in procedure stack, + * needed only if @copy_full is not set + * @syscall - true if called upon direct system call exit (no signal handlers) + * + * This does two things: + * + * 1) It is possible that upon kernel entry pcshtp == 0 in some cases: + * - user signal handler had pcshtp==0x20 before return to sigreturn() + * - user context had pcshtp==0x20 before return to makecontext_trampoline() + * - chain stack underflow happened + * So it is possible in sigreturn() and traps, but not in system calls. + * If we are using the trick with return to FILL user hardware stacks then + * we must have a frame in the chain stack to return to. So in this case the + * kernel's chain stack is moved up by one frame (0x20 bytes). + * We also fill the new frame with actual user data and update stacks->pcshtp; + * this is needed to keep the coherent state where saved stacks->pcshtp values + * show how much data from user space has been spilled to kernel space. + * + * 2) It is not possible to always FILL all of the user data that have been + * SPILLed to kernel stacks. So we manually copy the leftovers that cannot + * be FILLed to user space. + * This copy does not update stacks->pshtp and stacks->pcshtp. The main reason + * is signals: if a signal arrives after copying then it must see a coherent + * state where saved stacks->pshtp and stacks->pcshtp values show how much + * data from user space has been spilled to kernel space. + */ +static __always_inline int kvm_user_hw_stacks_prepare( + struct e2k_stacks *stacks, pt_regs_t *regs, + u64 cur_window_q, enum restore_caller from, int syscall) +{ + e2k_pcshtp_t u_pcshtp = stacks->pcshtp; + int ret; + + BUG_ON(!kvm_trap_user_mode(regs)); + + BUG_ON(from & FROM_PV_VCPU_MODE); + + /* + * 1) Make sure there is free space in kernel chain stack to return to + */ + if (!syscall && u_pcshtp == 0) { + DebugUST("%s(): PCSHTP is empty\n", __func__); + } + + /* + * 2) User data copying will be done somewhat later, in + * kvm_prepare_user_hv_stacks() + */ + ret = kvm_user_hw_stacks_copy(regs); + if (ret != 0) { + pr_err("%s(): copying of hardware stacks failed, error %d\n", + __func__, ret); + do_exit(SIGKILL); + } + return ret; +} + +static inline int +kvm_ret_from_fork_prepare_hv_stacks(struct pt_regs *regs) +{ + return kvm_user_hw_stacks_copy(regs); +} + +#ifdef CONFIG_KVM_GUEST_KERNEL +/* native guest kernel */ + +static __always_inline void +kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) +{ + kvm_kernel_hw_stack_frames_copy(dst, src, size); +} + +static __always_inline void +collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) +{ + kvm_collapse_kernel_ps(dst, src, spilled_size); +} + +static __always_inline void +collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) +{ + kvm_collapse_kernel_pcs(dst, src, spilled_size); +} + +static __always_inline int +user_hw_stacks_copy(struct e2k_stacks *stacks, + pt_regs_t *regs, u64 cur_window_q, bool copy_full) +{ + return kvm_user_hw_stacks_copy(regs); +} + +static __always_inline void host_user_hw_stacks_prepare( + struct e2k_stacks *stacks, pt_regs_t *regs, + u64 cur_window_q, enum restore_caller from, int syscall) +{ + if (regs->sys_num == __NR_e2k_longjmp2) { + /* hardware stacks are already prepared */ + return; + } + kvm_user_hw_stacks_prepare(stacks, regs, cur_window_q, + from, syscall); +} + +static inline int +ret_from_fork_prepare_hv_stacks(struct pt_regs *regs) +{ + return kvm_ret_from_fork_prepare_hv_stacks(regs); +} + +#endif /* CONFIG_KVM_GUEST_KERNEL */ + +#endif /* !(_E2K_KVM_GUEST_COPY_HW_STACKS_H) */ diff --git a/arch/e2k/include/asm/kvm/guest/process.h b/arch/e2k/include/asm/kvm/guest/process.h index 6da1548..3f929ee 100644 --- a/arch/e2k/include/asm/kvm/guest/process.h +++ b/arch/e2k/include/asm/kvm/guest/process.h @@ -11,25 +11,6 @@ #include #include -#undef DEBUG_USER_STACKS_MODE -#undef DebugKVMUS
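The copy routines in the new copy-hw-stacks.h above repeat one accounting pattern for both hardware stacks: the *HTP register value gives the number of bytes of user frames SPILLed into kernel memory, and exactly that many bytes are copied back to offset ind - to_copy of the user stack area, after checking that the area still has room. A minimal user-space sketch of that arithmetic follows; the types and both macro bodies are invented stand-ins, not the real e2k definitions.

	/* Illustrative only: stub types, invented macro bodies. */
	#include <stdio.h>

	typedef long e2k_size_t;

	/* stand-ins for the real register decoders */
	#define GET_PSHTP_MEM_INDEX(pshtp)	((e2k_size_t)(pshtp))
	#define PCSHTP_SIGN_EXTEND(pcshtp)	((e2k_size_t)(short)(pcshtp))

	/* returns the offset in the user stack where the copy-back must
	 * land, or -1 when the user area cannot hold the spilled bytes */
	static e2k_size_t copy_back_window(e2k_size_t ind, e2k_size_t size,
					   e2k_size_t to_copy)
	{
		e2k_size_t from = ind - to_copy;	/* user data starts here */

		if (from < 0 || size - from < to_copy)
			return -1;		/* would overflow user stack */
		return from;
	}

	int main(void)
	{
		/* e.g. 0x40 bytes of chain frames spilled, index at 0x80 */
		e2k_size_t to_copy = PCSHTP_SIGN_EXTEND(0x40);

		printf("copy to user offset 0x%lx\n",
		       (unsigned long)copy_back_window(0x80, 0x1000, to_copy));
		return 0;
	}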
-#define DEBUG_USER_STACKS_MODE 0 -#define DebugKVMUS(fmt, args...) \ -({ \ - if (DEBUG_USER_STACKS_MODE) \ - pr_info("%s(): " fmt, __func__, ##args); \ -}) - -extern bool debug_ustacks; -#undef DEBUG_USER_STACKS_MODE -#undef DebugUST -#define DEBUG_USER_STACKS_MODE 0 /* guest user stacks debug mode */ -#define DebugUST(fmt, args...) \ -({ \ - if (debug_ustacks) \ - pr_info("%s(): " fmt, __func__, ##args); \ -}) - /* real flush of hardware stacks should be done by host hypercall */ /* so here nothing to do */ #ifdef CONFIG_KVM_GUEST_HW_PV @@ -129,474 +110,8 @@ kvm_preserve_user_hw_stacks_to_copy(e2k_stacks_t *u_stacks, /* after copying and therefore are not preserve */ } -static inline void -kvm_kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) -{ - fast_tagged_memory_copy(dst, src, size, - TAGGED_MEM_STORE_REC_OPC | - MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, - TAGGED_MEM_LOAD_REC_OPC | - MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, true); -} - -static __always_inline void -kvm_collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) -{ - e2k_psp_hi_t k_psp_hi; - u64 ps_ind, ps_size; - u64 size; - - DebugUST("current host procedure stack index 0x%x, PSHTP 0x%x\n", - NATIVE_NV_READ_PSP_HI_REG().PSP_hi_ind, - NATIVE_NV_READ_PSHTP_REG().PSHTP_ind); - - KVM_COPY_STACKS_TO_MEMORY(); - ATOMIC_GET_HW_PS_SIZES(ps_ind, ps_size); - - size = ps_ind - spilled_size; - BUG_ON(!IS_ALIGNED(size, ALIGN_PSTACK_TOP_SIZE) || (s64) size < 0); - - kvm_kernel_hw_stack_frames_copy(dst, src, size); - - k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); - k_psp_hi.PSP_hi_ind = size; - HYPERVISOR_update_psp_hi(k_psp_hi.PSP_hi_half); - - DebugUST("move spilled procedure part from host top %px to " - "bottom %px, size 0x%llx\n", - src, dst, size); - DebugUST("host kernel procedure stack index is now 0x%x, " - "guest user PSHTP 0x%llx\n", - k_psp_hi.PSP_hi_ind, spilled_size); -} - -static __always_inline void -kvm_collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) -{ - e2k_pcsp_hi_t k_pcsp_hi; - u64 pcs_ind, pcs_size; - u64 size; - - DebugUST("current host chain stack index 0x%x, PCSHTP 0x%llx\n", - NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_ind, - NATIVE_READ_PCSHTP_REG_SVALUE()); - - KVM_COPY_STACKS_TO_MEMORY(); - ATOMIC_GET_HW_PCS_SIZES(pcs_ind, pcs_size); - - size = pcs_ind - spilled_size; - BUG_ON(!IS_ALIGNED(size, ALIGN_PCSTACK_TOP_SIZE) || (s64) size < 0); - - kvm_kernel_hw_stack_frames_copy(dst, src, size); - - k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - k_pcsp_hi.PCSP_hi_ind = size; - HYPERVISOR_update_pcsp_hi(k_pcsp_hi.PCSP_hi_half); - - DebugUST("move spilled chain part from host top %px to " - "bottom %px, size 0x%llx\n", - src, dst, size); - DebugUST("host kernel chain stack index is now 0x%x, " - "guest user PCSHTP 0x%llx\n", - k_pcsp_hi.PCSP_hi_ind, spilled_size); -} - -static __always_inline int -copy_stack_page_from_kernel(void __user *dst, void *src, e2k_size_t to_copy, - bool is_chain) -{ - int ret; - - ret = HYPERVISOR_copy_hw_stacks_frames(dst, src, to_copy, is_chain); - return ret; -} - -static __always_inline int -copy_stack_page_to_user(void __user *dst, void *src, e2k_size_t to_copy, - bool is_chain) -{ - struct page *page = NULL; - unsigned long addr = (unsigned long)dst; - void *k_dst; - e2k_size_t offset; - mm_segment_t seg; - unsigned long ts_flag; - int npages; - int ret; - - if (to_copy == 0) - return 0; - - DebugUST("started to copy %s stack from kernel stack %px to user %px " - "size 0x%lx\n", - (is_chain) ? 
"chain" : "procedure", - src, dst, to_copy); - seg = get_fs(); - set_fs(K_USER_DS); - ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); - do { - npages = __get_user_pages_fast(addr, 1, 1, &page); - if (npages == 1) - break; - npages = get_user_pages_unlocked(addr, 1, &page, FOLL_WRITE); - if (npages == 1) - break; - clear_ts_flag(ts_flag); - set_fs(seg); - ret = -EFAULT; - goto failed; - } while (npages != 1); - clear_ts_flag(ts_flag); - set_fs(seg); - - offset = addr & ~PAGE_MASK; - k_dst = page_address(page) + offset; - DebugUST("copy stack frames from kernel %px to user %px, size 0x%lx\n", - src, k_dst, to_copy); - ret = copy_stack_page_from_kernel(k_dst, src, to_copy, is_chain); - if (ret != 0) { - pr_err("%s(): copy %s stack to user %px from kernel %px, " - "size 0x%lx failed, error %d\n", - __func__, (is_chain) ? "chain" : "procedure", - src, k_dst, to_copy, ret); - goto failed_copy; - } - -failed_copy: - put_page(page); -failed: - return ret; -} - -static __always_inline int -kvm_copy_user_stack_from_kernel(void __user *dst, void *src, - e2k_size_t to_copy, bool is_chain) -{ - e2k_size_t offset, len; - int ret; - - if (to_copy == 0) - return 0; - - DebugUST("started to copy %s stack from kernel stack %px to user %px " - "size 0x%lx\n", - (is_chain) ? "chain" : "procedure", - src, dst, to_copy); - do { - offset = (unsigned long)dst & ~PAGE_MASK; - len = min(to_copy, PAGE_SIZE - offset); - ret = copy_stack_page_to_user(dst, src, len, is_chain); - if (ret != 0) - goto failed; - dst += len; - src += len; - to_copy -= len; - } while (to_copy > 0); - - return 0; - -failed: - return ret; -} - -static __always_inline int -kvm_user_hw_stacks_copy(pt_regs_t *regs) -{ - e2k_psp_lo_t psp_lo; - e2k_psp_hi_t psp_hi; - e2k_pshtp_t pshtp; - e2k_pcsp_lo_t pcsp_lo; - e2k_pcsp_hi_t pcsp_hi; - e2k_pcshtp_t pcshtp; - e2k_stacks_t *stacks; - void __user *dst; - void *src; - long copyed_ps_size, copyed_pcs_size, to_copy, from, there_are; - int ret; - - if (unlikely(irqs_disabled())) { - pr_err("%s() called with IRQs disabled PSP: 0x%lx UPSR: 0x%lx " - "under UPSR %d\n", - __func__, KVM_READ_PSR_REG_VALUE(), - KVM_READ_UPSR_REG_VALUE(), - kvm_get_vcpu_state()->irqs_under_upsr); - local_irq_enable(); - WARN_ON(true); - } - - stacks = ®s->stacks; - copyed_ps_size = regs->copyed.ps_size; - copyed_pcs_size = regs->copyed.pcs_size; - if (unlikely(copyed_ps_size || copyed_pcs_size)) { - /* stacks have been already copyed */ - BUG_ON(copyed_ps_size != GET_PSHTP_MEM_INDEX(stacks->pshtp) && - GET_PSHTP_MEM_INDEX(stacks->pshtp) != 0); - BUG_ON(copyed_pcs_size != PCSHTP_SIGN_EXTEND(stacks->pcshtp) && - PCSHTP_SIGN_EXTEND(stacks->pcshtp) != SZ_OF_CR); - return 0; - } - - ret = HYPERVISOR_copy_stacks_to_memory(); - if (ret != 0) { - pr_err("%s(): flush of kernel stacks failed, error %d\n", - __func__, ret); - goto failed; - } - - /* copy user part of procedure stack from kernel back to user */ - ATOMIC_READ_HW_STACKS_REGS(psp_lo.PSP_lo_half, psp_hi.PSP_hi_half, - pshtp.PSHTP_reg, - pcsp_lo.PCSP_lo_half, pcsp_hi.PCSP_hi_half, - pcshtp); - src = (void *)psp_lo.PSP_lo_base; - DebugUST("procedure stack at kernel from %px, size 0x%x, ind 0x%x, " - "pshtp 0x%llx\n", - src, psp_hi.PSP_hi_size, psp_hi.PSP_hi_ind, pshtp.PSHTP_reg); - BUG_ON(psp_hi.PSP_hi_ind > psp_hi.PSP_hi_size); - - if (stacks->psp_hi.PSP_hi_ind >= stacks->psp_hi.PSP_hi_size) { - /* procedure stack overflow, need expand */ - ret = handle_proc_stack_bounds(stacks, regs->trap); - if (unlikely(ret)) { - pr_err("%s(): could not handle process %s (%d) " - "procedure stack 
overflow, error %d\n", - __func__, current->comm, current->pid, ret); - goto failed; - } - } - to_copy = GET_PSHTP_MEM_INDEX(stacks->pshtp); - BUG_ON(to_copy < 0); - from = stacks->psp_hi.PSP_hi_ind - to_copy; - BUG_ON(from < 0); - dst = (void __user *)stacks->psp_lo.PSP_lo_base + from; - DebugUST("procedure stack at user from %px, ind 0x%x, " - "pshtp size to copy 0x%lx\n", - dst, stacks->psp_hi.PSP_hi_ind, to_copy); - there_are = stacks->psp_hi.PSP_hi_size - from; - if (there_are < to_copy) { - pr_err("%s(): user procedure stack overflow, there are 0x%lx " - "to copy need 0x%lx, not yet implemented\n", - __func__, there_are, to_copy); - BUG_ON(true); - } - if (to_copy > 0) { - ret = kvm_copy_user_stack_from_kernel(dst, src, to_copy, false); - if (ret != 0) { - pr_err("%s(): procedure stack copying from kernel %px " - "to user %px, size 0x%lx failed, error %d\n", - __func__, src, dst, to_copy, ret); - goto failed; - } - regs->copyed.ps_size = to_copy; - } - - /* copy user part of chain stack from kernel back to user */ - src = (void *)pcsp_lo.PCSP_lo_base; - DebugUST("chain stack at kernel from %px, size 0x%x, ind 0x%x, " - "pcshtp 0x%x\n", - src, pcsp_hi.PCSP_hi_size, pcsp_hi.PCSP_hi_ind, pcshtp); - BUG_ON(pcsp_hi.PCSP_hi_ind + PCSHTP_SIGN_EXTEND(pcshtp) > - pcsp_hi.PCSP_hi_size); - if (stacks->pcsp_hi.PCSP_hi_ind >= stacks->pcsp_hi.PCSP_hi_size) { - /* chain stack overflow, need expand */ - ret = handle_chain_stack_bounds(stacks, regs->trap); - if (unlikely(ret)) { - pr_err("%s(): could not handle process %s (%d) " - "chain stack overflow, error %d\n", - __func__, current->comm, current->pid, ret); - goto failed; - } - } - to_copy = PCSHTP_SIGN_EXTEND(stacks->pcshtp); - BUG_ON(to_copy < 0); - from = stacks->pcsp_hi.PCSP_hi_ind - to_copy; - BUG_ON(from < 0); - dst = (void *)stacks->pcsp_lo.PCSP_lo_base + from; - BUG_ON(to_copy > pcsp_hi.PCSP_hi_ind + PCSHTP_SIGN_EXTEND(pcshtp)); - DebugUST("chain stack at user from %px, ind 0x%x, " - "pcshtp size to copy 0x%lx\n", - dst, stacks->pcsp_hi.PCSP_hi_ind, to_copy); - there_are = stacks->pcsp_hi.PCSP_hi_size - from; - if (there_are < to_copy) { - pr_err("%s(): user chain stack overflow, there are 0x%lx " - "to copy need 0x%lx, not yet implemented\n", - __func__, there_are, to_copy); - BUG_ON(true); - } - if (to_copy > 0) { - ret = kvm_copy_user_stack_from_kernel(dst, src, to_copy, true); - if (ret != 0) { - pr_err("%s(): chain stack copying from kernel %px " - "to user %px, size 0x%lx failed, error %d\n", - __func__, src, dst, to_copy, ret); - goto failed; - } - regs->copyed.pcs_size = to_copy; - } - -failed: - if (DEBUG_USER_STACKS_MODE) - debug_ustacks = false; - return ret; -} - -/* - * Copy additional frames injected to the guest kernel stack, but these frames - * are for guest user stack and should be copyed from kernel back to the top - * of user. 
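As this comment says, injected frames are appended at the current top of the user chain stack and PCSP_hi_ind is then advanced by the copied size. A stand-alone model of that bookkeeping, using a plain buffer instead of the real chain stack; SZ_OF_CR is assumed here to be the 0x20-byte chain record size mentioned in the docstrings, and all types are invented.

	#include <stdio.h>
	#include <string.h>

	#define SZ_OF_CR 0x20UL		/* one chain record, per the comments */

	struct user_pcs {
		unsigned char base[0x1000];
		unsigned long ind;	/* current top, PCSP_hi_ind analogue */
		unsigned long size;
	};

	/* append injected frames at the top and advance the index, mirroring
	 * the regs->copyed.pcs_injected_frames_size bookkeeping above */
	static int inject_pcs_frames(struct user_pcs *pcs, const void *src,
				     int frames_num)
	{
		unsigned long to_copy = frames_num * SZ_OF_CR;

		if (pcs->size - pcs->ind < to_copy)
			return -1;	/* would overflow - caller must expand */
		memcpy(pcs->base + pcs->ind, src, to_copy);
		pcs->ind += to_copy;	/* like PCSP_hi_ind += to_copy */
		return 0;
	}

	int main(void)
	{
		static struct user_pcs pcs = { .ind = 0x40, .size = 0x1000 };
		unsigned char frames[2 * SZ_OF_CR] = { 0 };
		int ret = inject_pcs_frames(&pcs, frames, 2);

		printf("inject: %d, new ind 0x%lx\n", ret, pcs.ind);
		return 0;
	}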
- */ -static __always_inline int -kvm_copy_injected_pcs_frames_to_user(pt_regs_t *regs, int frames_num) -{ - e2k_size_t pcs_ind, pcs_size; - e2k_addr_t pcs_base; - int pcsh_top; - e2k_stacks_t *stacks; - void __user *dst; - void *src; - long copyed_frames_size, to_copy, from, there_are, frames_size; - int ret; - - BUG_ON(irqs_disabled()); - - frames_size = frames_num * SZ_OF_CR; - copyed_frames_size = regs->copyed.pcs_injected_frames_size; - if (unlikely(copyed_frames_size >= frames_size)) { - /* all frames have been already copyed */ - return 0; - } else { - /* copyed only part of frames - not implemented case */ - BUG_ON(copyed_frames_size != 0); - } - - stacks = ®s->stacks; - ATOMIC_GET_HW_PCS_SIZES_BASE_TOP(pcs_ind, pcs_size, pcs_base, pcsh_top); - - /* guest user stacks part spilled to kernel should be already copyed */ - BUG_ON(PCSHTP_SIGN_EXTEND(regs->copyed.pcs_size != stacks->pcshtp)); - - src = (void *)(pcs_base + regs->copyed.pcs_size); - DebugUST("chain stack at kernel from %px, size 0x%lx + 0x%lx, " - "ind 0x%lx, pcsh top 0x%x\n", - src, pcs_size, frames_size, pcs_ind, pcsh_top); - BUG_ON(regs->copyed.pcs_size + frames_size > pcs_ind + pcsh_top); - if (stacks->pcsp_hi.PCSP_hi_ind + frames_size > - stacks->pcsp_hi.PCSP_hi_size) { - /* user chain stack can overflow, need expand */ - ret = handle_chain_stack_bounds(stacks, regs->trap); - if (unlikely(ret)) { - pr_err("%s(): could not handle process %s (%d) " - "chain stack overflow, error %d\n", - __func__, current->comm, current->pid, ret); - goto failed; - } - } - to_copy = frames_size; - BUG_ON(to_copy < 0); - from = stacks->pcsp_hi.PCSP_hi_ind; - BUG_ON(from < regs->copyed.pcs_size); - dst = (void *)stacks->pcsp_lo.PCSP_lo_base + from; - DebugUST("chain stack at user from %px, ind 0x%x, " - "frames size to copy 0x%lx\n", - dst, stacks->pcsp_hi.PCSP_hi_ind, to_copy); - there_are = stacks->pcsp_hi.PCSP_hi_size - from; - if (there_are < to_copy) { - pr_err("%s(): user chain stack overflow, there are 0x%lx " - "to copy need 0x%lx, not yet implemented\n", - __func__, there_are, to_copy); - BUG_ON(true); - } - if (likely(to_copy > 0)) { - ret = kvm_copy_user_stack_from_kernel(dst, src, to_copy, true); - if (ret != 0) { - pr_err("%s(): chain stack copying from kernel %px " - "to user %px, size 0x%lx failed, error %d\n", - __func__, src, dst, to_copy, ret); - goto failed; - } - regs->copyed.pcs_injected_frames_size = to_copy; - /* increment chain stack pointer */ - stacks->pcsp_hi.PCSP_hi_ind += to_copy; - } else { - BUG_ON(true); - ret = 0; - } - -failed: - if (DEBUG_USER_STACKS_MODE) - debug_ustacks = false; - return ret; -} - extern void kvm_get_mm_notifier(thread_info_t *ti, struct mm_struct *mm); -/** - * user_hw_stacks_prepare - prepare user hardware stacks that have been - * SPILLed to kernel back to user space - * @stacks - saved user stack registers - * @cur_window_q - size of current window in procedure stack, - * needed only if @copy_full is not set - * @syscall - true if called upon direct system call exit (no signal handlers) - * - * This does two things: - * - * 1) It is possible that upon kernel entry pcshtp == 0 in some cases: - * - user signal handler had pcshtp==0x20 before return to sigreturn() - * - user context had pcshtp==0x20 before return to makecontext_trampoline() - * - chain stack underflow happened - * So it is possible in sigreturn() and traps, but not in system calls. - * If we are using the trick with return to FILL user hardware stacks than - * we must have frame in chain stack to return to. 
So in this case kernel's - * chain stack is moved up by one frame (0x20 bytes). - * We also fill the new frame with actual user data and update stacks->pcshtp, - * this is needed to keep the coherent state where saved stacks->pcshtp values - * shows how much data from user space has been spilled to kernel space. - * - * 2) It is not possible to always FILL all of user data that have been - * SPILLed to kernel stacks. So we manually copy the leftovers that can - * not be FILLed to user space. - * This copy does not update stacks->pshtp and stacks->pcshtp. Main reason - * is signals: if a signal arrives after copying then it must see a coherent - * state where saved stacks->pshtp and stacks->pcshtp values show how much - * data from user space has been spilled to kernel space. - */ -static __always_inline int kvm_user_hw_stacks_prepare( - struct e2k_stacks *stacks, pt_regs_t *regs, - u64 cur_window_q, enum restore_caller from, int syscall) -{ - e2k_pcshtp_t u_pcshtp = stacks->pcshtp; - int ret; - - BUG_ON(!kvm_trap_user_mode(regs)); - - BUG_ON(from & FROM_PV_VCPU_MODE); - - /* - * 1) Make sure there is free space in kernel chain stack to return to - */ - if (!syscall && u_pcshtp == 0) { - DebugUST("%s(): PCSHTP is empty\n", __func__); - } - - /* - * 2) User data copying will be done some later at - * kvm_prepare_user_hv_stacks() - */ - ret = kvm_user_hw_stacks_copy(regs); - if (ret != 0) { - pr_err("%s(): copying of hardware stacks failed< error %d\n", - __func__, ret); - do_exit(SIGKILL); - } - return ret; -} - -static inline int -kvm_ret_from_fork_prepare_hv_stacks(struct pt_regs *regs) -{ - return kvm_user_hw_stacks_copy(regs); -} - static __always_inline void kvm_jump_to_ttable_entry(struct pt_regs *regs, enum restore_caller from) { @@ -823,55 +338,12 @@ preserve_user_hw_stacks_to_copy(e2k_stacks_t *u_stacks, kvm_preserve_user_hw_stacks_to_copy(u_stacks, cur_stacks); } -static __always_inline void -kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) -{ - kvm_kernel_hw_stack_frames_copy(dst, src, size); -} - -static __always_inline void -collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) -{ - kvm_collapse_kernel_ps(dst, src, spilled_size); -} - -static __always_inline void -collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) -{ - kvm_collapse_kernel_pcs(dst, src, spilled_size); -} - -static __always_inline int -user_hw_stacks_copy(struct e2k_stacks *stacks, - pt_regs_t *regs, u64 cur_window_q, bool copy_full) -{ - return kvm_user_hw_stacks_copy(regs); -} - -static __always_inline void host_user_hw_stacks_prepare( - struct e2k_stacks *stacks, pt_regs_t *regs, - u64 cur_window_q, enum restore_caller from, int syscall) -{ - if (regs->sys_num == __NR_e2k_longjmp2) { - /* hardware stacks already are prepared */ - return; - } - kvm_user_hw_stacks_prepare(stacks, regs, cur_window_q, - from, syscall); -} - static __always_inline void host_exit_to_usermode_loop(struct pt_regs *regs, bool syscall, bool has_signal) { /* native & guest kernels cannot be as host */ } -static inline int -ret_from_fork_prepare_hv_stacks(struct pt_regs *regs) -{ - return kvm_ret_from_fork_prepare_hv_stacks(regs); -} - static __always_inline void jump_to_ttable_entry(struct pt_regs *regs, enum restore_caller from) { @@ -885,8 +357,6 @@ virt_cpu_thread_init(struct task_struct *boot_task) KVM_GET_VCPU_STATE_BASE(vcpu_state_base); task_thread_info(boot_task)->vcpu_state_base = vcpu_state_base; - if (!IS_HV_GM()) - kvm_vcpu_boot_thread_init(boot_task); } static inline int diff 
--git a/arch/e2k/include/asm/kvm/guest/trace-defs.h b/arch/e2k/include/asm/kvm/guest/trace-defs.h new file mode 100644 index 0000000..1d6d009 --- /dev/null +++ b/arch/e2k/include/asm/kvm/guest/trace-defs.h @@ -0,0 +1,41 @@ +#ifndef _E2K_KVM_GUEST_TRACE_DEFS_H_ +#define _E2K_KVM_GUEST_TRACE_DEFS_H_ + +#include + +#include +#include +#include + +static inline void +trace_kvm_get_gva_spt_translation(e2k_addr_t address, + pgdval_t *pgd, pudval_t *pud, pmdval_t *pmd, pteval_t *pte, int *pt_level) +{ + mmu_spt_trans_t spt_trans; + int ret; + + ret = HYPERVISOR_get_spt_translation(address, &spt_trans); + if (unlikely(ret != 0)) { + pr_err("%s() : host could not get guest address 0x%lx " + "translation at SPTs, error %d\n", + __func__, address, ret); + *pgd = -1; + *pt_level = E2K_PGD_LEVEL_NUM; + return; + } + *pt_level = spt_trans.pt_levels; + if (*pt_level <= E2K_PGD_LEVEL_NUM) { + *pgd = spt_trans.pgd; + } + if (*pt_level <= E2K_PUD_LEVEL_NUM) { + *pud = spt_trans.pud; + } + if (*pt_level <= E2K_PMD_LEVEL_NUM) { + *pmd = spt_trans.pmd; + } + if (*pt_level <= E2K_PTE_LEVEL_NUM) { + *pte = spt_trans.pte; + } +} + +#endif /* _E2K_KVM_GUEST_TRACE_DEFS_H_ */ diff --git a/arch/e2k/include/asm/kvm/guest/trace-hw-stacks.h b/arch/e2k/include/asm/kvm/guest/trace-hw-stacks.h new file mode 100644 index 0000000..d0567da --- /dev/null +++ b/arch/e2k/include/asm/kvm/guest/trace-hw-stacks.h @@ -0,0 +1,308 @@ +#if !defined(_KVM_GUEST_TRACE_COPY_HW_STACKS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _KVM_GUEST_TRACE_COPY_HW_STACKS_H + +#include +#include + +#include +#include +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM guest + +TRACE_EVENT( + guest_copy_hw_stack, + + TP_PROTO(void *dst, void *src, unsigned long size, bool is_chain), + + TP_ARGS(dst, src, size, is_chain), + + TP_STRUCT__entry( + __field( void *, dst ) + __field( void *, src ) + __field( u64, size ) + __field( bool, is_chain ) + __field( pgdval_t, dst_pgd ) + __field( pudval_t, dst_pud ) + __field( pmdval_t, dst_pmd ) + __field( pteval_t, dst_pte ) + __field( int, dst_pt_level ) + __field( pgdval_t, src_pgd ) + __field( pudval_t, src_pud ) + __field( pmdval_t, src_pmd ) + __field( pteval_t, src_pte ) + __field( int, src_pt_level ) + __field( pgdval_t, dst_spt_pgd ) + __field( pudval_t, dst_spt_pud ) + __field( pmdval_t, dst_spt_pmd ) + __field( pteval_t, dst_spt_pte ) + __field( int, dst_spt_level ) + __field( pgdval_t, src_spt_pgd ) + __field( pudval_t, src_spt_pud ) + __field( pmdval_t, src_spt_pmd ) + __field( pteval_t, src_spt_pte ) + __field( int, src_spt_level ) + ), + + TP_fast_assign( + __entry->dst = dst; + __entry->src = src; + __entry->size = size; + __entry->is_chain = is_chain; + + trace_get_va_translation(current->mm, (e2k_addr_t)dst, + &__entry->dst_pgd, &__entry->dst_pud, &__entry->dst_pmd, + &__entry->dst_pte, &__entry->dst_pt_level); + trace_kvm_get_gva_spt_translation((e2k_addr_t)dst, + &__entry->dst_spt_pgd, &__entry->dst_spt_pud, + &__entry->dst_spt_pmd, &__entry->dst_spt_pte, + &__entry->dst_spt_level); + + trace_get_va_translation(current->mm, (e2k_addr_t)src, + &__entry->src_pgd, &__entry->src_pud, &__entry->src_pmd, + &__entry->src_pte, &__entry->src_pt_level); + trace_kvm_get_gva_spt_translation((e2k_addr_t)src, + &__entry->src_spt_pgd, &__entry->src_spt_pud, + &__entry->src_spt_pmd, &__entry->src_spt_pte, + &__entry->src_spt_level); + ), + + TP_printk("copy %s stack guest user <- guest kernel: dst %px " + "src %px size %llx\n" + " user guest dst %px : pgd 0x%016lx : %s\n" + " Access 
mode: %s%s\n" + " pud 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pmd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pte 0x%016lx : %s\n" + " Access mode: %s%s\n" + " user guest dst spt %px : pgd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pud 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pmd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pte 0x%016lx : %s\n" + " Access mode: %s%s\n" + " kernel guest src %px : pgd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pud 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pmd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pte 0x%016lx : %s\n" + " Access mode: %s%s\n" + " kernel guest src spt %px : pgd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pud 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pmd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pte 0x%016lx : %s\n" + " Access mode: %s%s", + (__entry->is_chain) ? "chain" : "procedure", + __entry->dst, + __entry->src, + __entry->size, + __entry->dst, + (__entry->dst_pt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->dst_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pgd, + __entry->dst_pt_level <= E2K_PGD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->dst_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pud, + __entry->dst_pt_level <= E2K_PUD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PMD_LEVEL_NUM) ? + __entry->dst_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pmd, + __entry->dst_pt_level <= E2K_PMD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->dst_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pte, + __entry->dst_pt_level <= E2K_PTE_LEVEL_NUM), + __entry->dst, + (__entry->dst_spt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->dst_spt_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_spt_pgd, + __entry->dst_spt_level <= E2K_PGD_LEVEL_NUM), + (__entry->dst_spt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->dst_spt_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_spt_pud, + __entry->dst_spt_level <= E2K_PUD_LEVEL_NUM), + (__entry->dst_spt_level <= E2K_PMD_LEVEL_NUM) ? + __entry->dst_spt_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_spt_pmd, + __entry->dst_spt_level <= E2K_PMD_LEVEL_NUM), + (__entry->dst_spt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->dst_spt_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_spt_pte, + __entry->dst_spt_level <= E2K_PTE_LEVEL_NUM), + __entry->src, + (__entry->src_pt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->src_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pgd, + __entry->src_pt_level <= E2K_PGD_LEVEL_NUM), + (__entry->src_pt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->src_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pud, + __entry->src_pt_level <= E2K_PUD_LEVEL_NUM), + (__entry->src_pt_level <= E2K_PMD_LEVEL_NUM) ? + __entry->src_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pmd, + __entry->src_pt_level <= E2K_PMD_LEVEL_NUM), + (__entry->src_pt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->src_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pte, + __entry->src_pt_level <= E2K_PTE_LEVEL_NUM), + __entry->src, + (__entry->src_spt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->src_spt_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_spt_pgd, + __entry->src_spt_level <= E2K_PGD_LEVEL_NUM), + (__entry->src_spt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->src_spt_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_spt_pud, + __entry->src_spt_level <= E2K_PUD_LEVEL_NUM), + (__entry->src_spt_level <= E2K_PMD_LEVEL_NUM) ? 
+ __entry->src_spt_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_spt_pmd, + __entry->src_spt_level <= E2K_PMD_LEVEL_NUM), + (__entry->src_spt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->src_spt_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_spt_pte, + __entry->src_spt_level <= E2K_PTE_LEVEL_NUM) + ) +); + +TRACE_EVENT( + guest_proc_stack_frame, + + TP_PROTO(kernel_mem_ps_t *ps_base, kernel_mem_ps_t *ps_frame), + + TP_ARGS(ps_base, ps_frame), + + TP_STRUCT__entry( + __field( kernel_mem_ps_t *, ps_base ) + __field_struct( kernel_mem_ps_t, ps_frame ) + __field( pgprotval_t, dtlb_entry ) + ), + + TP_fast_assign( + __entry->ps_base = ps_base; + __entry->ps_frame = *ps_frame; + __entry->dtlb_entry = HYPERVISOR_mmu_probe((e2k_addr_t)ps_base, + KVM_MMU_PROBE_ENTRY); + ), + + TP_printk(" %px (dtlb 0x%016lx) : 0x%016lx 0x%016lx", + __entry->ps_base, __entry->dtlb_entry, + __entry->ps_frame.word_lo, + __entry->ps_frame.word_hi) +); + +TRACE_EVENT( + guest_chain_stack_frame, + + TP_PROTO(e2k_mem_crs_t *pcs_base, e2k_mem_crs_t *pcs_frame), + + TP_ARGS(pcs_base, pcs_frame), + + TP_STRUCT__entry( + __field( e2k_mem_crs_t *, pcs_base ) + __field_struct( e2k_mem_crs_t, pcs_frame ) + __field( pgprotval_t, dtlb_entry ) + ), + + TP_fast_assign( + __entry->pcs_base = pcs_base; + __entry->pcs_frame = *pcs_frame; + __entry->dtlb_entry = HYPERVISOR_mmu_probe((e2k_addr_t)pcs_base, + KVM_MMU_PROBE_ENTRY); + ), + + TP_printk(" %px (dtlb 0x%016lx) : 0x%016llx 0x%016llx " + "0x%016llx 0x%016llx", + __entry->pcs_base, __entry->dtlb_entry, + __entry->pcs_frame.cr0_lo.CR0_lo_half, + __entry->pcs_frame.cr0_hi.CR0_hi_half, + __entry->pcs_frame.cr1_lo.CR1_lo_half, + __entry->pcs_frame.cr1_hi.CR1_hi_half) +); + +TRACE_EVENT( + guest_va_tlb_state, + + TP_PROTO(e2k_addr_t address), + + TP_ARGS(address), + + TP_STRUCT__entry( + __field( e2k_addr_t, address ) + __field( tlb_tag_t, set0_tag ) + __field_struct( pte_t, set0_entry ) + __field( tlb_tag_t, set1_tag ) + __field_struct( pte_t, set1_entry ) + __field( tlb_tag_t, set2_tag ) + __field_struct( pte_t, set2_entry ) + __field( tlb_tag_t, set3_tag ) + __field_struct( pte_t, set3_entry ) + __field( tlb_tag_t, setH_tag ) + __field_struct( pte_t, setH_entry ) + __field( u64, dtlb_entry ) + __field( unsigned long, mmu_pptb ) + __field( unsigned long, mmu_pid ) + ), + + TP_fast_assign( + __entry->address = address; + __entry->set0_tag = HYPERVISOR_get_tlb_set_tag(address, 0, false); + pte_val(__entry->set0_entry) = + HYPERVISOR_get_tlb_set_entry(address, 0, false); + __entry->set1_tag = HYPERVISOR_get_tlb_set_tag(address, 1, false); + pte_val(__entry->set1_entry) = + HYPERVISOR_get_tlb_set_entry(address, 1, false); + __entry->set2_tag = HYPERVISOR_get_tlb_set_tag(address, 2, false); + pte_val(__entry->set2_entry) = + HYPERVISOR_get_tlb_set_entry(address, 2, false); + __entry->set3_tag = HYPERVISOR_get_tlb_set_tag(address, 3, false); + pte_val(__entry->set3_entry) = + HYPERVISOR_get_tlb_set_entry(address, 3, false); + __entry->setH_tag = HYPERVISOR_get_tlb_set_tag(address, 3, true); + pte_val(__entry->setH_entry) = + HYPERVISOR_get_tlb_set_entry(address, 3, true); + __entry->dtlb_entry = HYPERVISOR_mmu_probe(address, + KVM_MMU_PROBE_ENTRY); + __entry->mmu_pptb = HYPERVISOR_get_host_mmu_pptb(); + __entry->mmu_pid = HYPERVISOR_get_host_mmu_pid(); + ), + + TP_printk(" 0x%016lx : dtlb 0x%016llx U_PPTB 0x%lx PID 0x%lx\n" + " TLB set #0 tag 0x%016lx entry 0x%016lx\n" + " TLB set #1 tag 0x%016lx entry 0x%016lx\n" + " TLB set #2 tag 0x%016lx entry 0x%016lx\n" + " TLB set #3 tag 
0x%016lx entry 0x%016lx\n" + " TLB set #H tag 0x%016lx entry 0x%016lx", + __entry->address, __entry->dtlb_entry, + __entry->mmu_pptb, __entry->mmu_pid, + __entry->set0_tag, pte_val(__entry->set0_entry), + __entry->set1_tag, pte_val(__entry->set1_entry), + __entry->set2_tag, pte_val(__entry->set2_entry), + __entry->set3_tag, pte_val(__entry->set3_entry), + __entry->setH_tag, pte_val(__entry->setH_entry) + ) +); + +#endif /* _KVM_GUEST_TRACE_COPY_HW_STACKS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/e2k/include/asm/kvm/guest +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace-hw-stacks + +/* This part must be outside protection */ +#include diff --git a/arch/e2k/include/asm/kvm/hypercall.h b/arch/e2k/include/asm/kvm/hypercall.h index 83cf8e5..0415119 100644 --- a/arch/e2k/include/asm/kvm/hypercall.h +++ b/arch/e2k/include/asm/kvm/hypercall.h @@ -205,8 +205,10 @@ static inline unsigned long generic_hypercall6(unsigned long nr, /* time (cycles) */ #define KVM_HCALL_GET_GUEST_RUNNING_TIME 6 /* get running time of guest */ /* VCPU at cycles */ -#define KVM_HCALL_GET_VCPU_START_THREAD 8 /* register on host the guest */ - /* kernel VCPU booting thread */ +#define KVM_HCALL_GET_HOST_MMU_PPTB 7 /* get the host MMU PPTB register */ + /* state */ +#define KVM_HCALL_GET_TLB_SET_TAG 8 /* get tag of TLB line set */ +#define KVM_HCALL_GET_TLB_SET_ENTRY 9 /* get entry of TLB line set */ #define KVM_HCALL_UPDATE_PSP_HI 10 /* write updated value to */ /* PSP_hi register */ #define KVM_HCALL_UPDATE_PCSP_HI 11 /* write updated value to */ @@ -215,6 +217,8 @@ static inline unsigned long generic_hypercall6(unsigned long nr, /* guest as task */ #define KVM_HCALL_UPDATE_WD_PSIZE 13 /* write updated psize field */ /* to the WD register */ +#define KVM_HCALL_GET_HOST_MMU_PID 14 /* get the host MMU PID register */ + /* state */ #define KVM_HCALL_MOVE_TAGGED_DATA 15 /* move quad value from to */ #define KVM_HCALL_UNFREEZE_TRAPS 16 /* unfreeze TIRs & trap */ /* cellar */ @@ -303,9 +307,27 @@ HYPERVISOR_get_guest_running_time(void) } static inline unsigned long -HYPERVISOR_get_vcpu_start_thread(void) +HYPERVISOR_get_tlb_set_tag(e2k_addr_t va, int set_no, bool huge) { - return light_hypercall0(KVM_HCALL_GET_VCPU_START_THREAD); + return light_hypercall3(KVM_HCALL_GET_TLB_SET_TAG, va, set_no, huge); +} + +static inline unsigned long +HYPERVISOR_get_tlb_set_entry(e2k_addr_t va, int set_no, bool huge) +{ + return light_hypercall3(KVM_HCALL_GET_TLB_SET_ENTRY, va, set_no, huge); +} + +static inline unsigned long +HYPERVISOR_get_host_mmu_pptb(void) +{ + return light_hypercall0(KVM_HCALL_GET_HOST_MMU_PPTB); +} + +static inline unsigned long +HYPERVISOR_get_host_mmu_pid(void) +{ + return light_hypercall0(KVM_HCALL_GET_HOST_MMU_PID); } static inline unsigned long @@ -611,6 +633,8 @@ HYPERVISOR_switch_to_expanded_guest_chain_stack(long delta_size, /* in page tables and flush tlb */ #define KVM_HCALL_SYNC_ADDR_RANGE 135 /* sync ptes in page */ /* tables without flushing tlb */ +#define KVM_HCALL_GET_SPT_TRANSLATION 137 /* get full translation of guest */ + /* address at shadow PTs */ #define KVM_HCALL_RECOVERY_FAULTED_TAGGED_STORE 141 /* recovery faulted store */ /* tagged value operations */ @@ -1252,10 +1276,11 @@ HYPERVISOR_guest_csd_lock_try_wait(void *lock) } static inline unsigned long -HYPERVISOR_pt_atomic_update(unsigned long gpa, void __user *old_gpte, - unsigned atomic_op, unsigned long prot_mask) +HYPERVISOR_pt_atomic_update(int gmmid_nr, unsigned long gpa, + void __user *old_gpte, + 
unsigned atomic_op, unsigned long prot_mask) { - return generic_hypercall4(KVM_HCALL_PT_ATOMIC_UPDATE, + return generic_hypercall5(KVM_HCALL_PT_ATOMIC_UPDATE, (int)gmmid_nr, gpa, (unsigned long)old_gpte, atomic_op, prot_mask); } @@ -1400,6 +1425,28 @@ HYPERVISOR_host_printk(char *msg, int size) return generic_hypercall2(KVM_HCALL_HOST_PRINTK, (unsigned long)msg, (unsigned long)size); } + +/* + * The guest virtual address info of full track of translation + * at the host shadow PTs + */ + +typedef struct mmu_spt_trans { + int pt_levels; /* the last significant level of PT */ + unsigned long pgd; + unsigned long pud; + unsigned long pmd; + unsigned long pte; +} mmu_spt_trans_t; + +static inline unsigned long +HYPERVISOR_get_spt_translation(e2k_addr_t address, + mmu_spt_trans_t __user *trans_info) +{ + return generic_hypercall2(KVM_HCALL_GET_SPT_TRANSLATION, address, + (unsigned long)trans_info); +} + static inline unsigned long HYPERVISOR_print_guest_kernel_ptes(e2k_addr_t address) { diff --git a/arch/e2k/include/asm/kvm/mmu.h b/arch/e2k/include/asm/kvm/mmu.h index 4993a2b..a7c714f 100644 --- a/arch/e2k/include/asm/kvm/mmu.h +++ b/arch/e2k/include/asm/kvm/mmu.h @@ -124,6 +124,16 @@ static inline void reset_spt_gpa_fault(struct kvm_vcpu *vcpu) vcpu->arch.mmu.spt_gpa_fault = false; } +static inline unsigned long get_mmu_u_pptb_reg(void) +{ + return NATIVE_READ_MMU_U_PPTB_REG(); +} + +static inline unsigned long get_mmu_pid_reg(void) +{ + return NATIVE_READ_MMU_PID_REG(); +} + static inline hpa_t kvm_get_gp_phys_root(struct kvm_vcpu *vcpu) { @@ -286,6 +296,11 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } +extern void kvm_get_spt_translation(struct kvm_vcpu *vcpu, e2k_addr_t address, + pgdval_t *pgd, pudval_t *pud, pmdval_t *pmd, + pteval_t *pte, int *pt_level); +extern unsigned long kvm_get_gva_to_hva(struct kvm_vcpu *vcpu, gva_t gva); + static inline gpa_t kvm_hva_to_gpa(struct kvm *kvm, unsigned long hva) { struct kvm_memslots *slots; diff --git a/arch/e2k/include/asm/kvm/page_track.h b/arch/e2k/include/asm/kvm/page_track.h index 614dd9d..8fd16ef 100644 --- a/arch/e2k/include/asm/kvm/page_track.h +++ b/arch/e2k/include/asm/kvm/page_track.h @@ -31,8 +31,8 @@ struct kvm_page_track_notifier_node { * @new: the data was written to the address. * @bytes: the written length. */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); + void (*track_write)(struct kvm_vcpu *vcpu, struct gmm_struct *gmm, + gpa_t gpa, const u8 *new, int bytes); /* * It is called when memory slot is being moved or removed * users can drop write-protection for the pages in that memory slot @@ -68,8 +68,8 @@ kvm_page_track_register_notifier(struct kvm *kvm, void kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); +void kvm_page_track_write(struct kvm_vcpu *vcpu, struct gmm_struct *gmm, + gpa_t gpa, const u8 *new, int bytes); void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #else /* ! 
CONFIG_KVM_HV_MMU */ static inline void kvm_page_track_init(struct kvm *kvm) diff --git a/arch/e2k/include/asm/kvm/process.h b/arch/e2k/include/asm/kvm/process.h index 1b9e77e..2c52152 100644 --- a/arch/e2k/include/asm/kvm/process.h +++ b/arch/e2k/include/asm/kvm/process.h @@ -15,16 +15,6 @@ #include #include -#undef DEBUG_KVM_GUEST_STACKS_MODE -#undef DebugGUST -#define DEBUG_KVM_GUEST_STACKS_MODE 0 /* guest user stacks */ - /* copy debug */ -#define DebugGUST(fmt, args...) \ -({ \ - if (DEBUG_KVM_GUEST_STACKS_MODE) \ - pr_info("%s(): " fmt, __func__, ##args); \ -}) - extern void kvm_clear_host_thread_info(thread_info_t *ti); extern gthread_info_t *create_guest_start_thread_info(struct kvm_vcpu *vcpu); extern int kvm_resume_vm_thread(void); @@ -233,416 +223,6 @@ void go2guest(long fn, bool priv_guest); INIT_HOST_VCPU_STATE_GREG_COPY(__ti, vcpu); \ }) -static inline void -prepare_pv_vcpu_inject_stacks(struct kvm_vcpu *vcpu, pt_regs_t *regs) -{ - e2k_stacks_t *stacks, *g_stacks; - gthread_info_t *gti = pv_vcpu_get_gti(vcpu); - - if (regs->g_stacks_valid) { - /* already prepared */ - return; - } - - /* all stacks at empty state, because of guest user recursion */ - /* of trap/system calls can not be */ - g_stacks = ®s->g_stacks; - g_stacks->usd_lo = gti->g_usd_lo; - g_stacks->usd_hi = gti->g_usd_hi; - g_stacks->top = gti->g_sbr.SBR_base; - g_stacks->psp_lo = gti->g_psp_lo; - g_stacks->psp_hi = gti->g_psp_hi; - g_stacks->pcsp_lo = gti->g_pcsp_lo; - g_stacks->pcsp_hi = gti->g_pcsp_hi; - - /* pshtp & pcshtp from guest user stack real state upon trap/syscall */ - stacks = ®s->stacks; - g_stacks->pshtp = stacks->pshtp; - g_stacks->pcshtp = stacks->pcshtp; - - regs->g_stacks_valid = true; - regs->g_stacks_active = false; - regs->need_inject = false; -} - -#undef EMULATE_EMPTY_CHAIN_STACK /* only to debug */ - -#ifdef EMULATE_EMPTY_CHAIN_STACK -static __always_inline void -pv_vcpu_emulate_empty_chain_staks(struct kvm_vcpu *vcpu, pt_regs_t *regs, - e2k_stacks_t *stacks, bool guest_user) -{ - e2k_pcshtp_t pcshtp; - unsigned long flags; - e2k_pcsp_lo_t g_pcsp_lo, k_pcsp_lo; - e2k_pcsp_hi_t g_pcsp_hi, k_pcsp_hi; - e2k_mem_crs_t __user *g_cframe; - e2k_mem_crs_t *k_crs; - int ret; - - pcshtp = stacks->pcshtp; - if (!(guest_user && pcshtp <= 0x40)) - return; - - g_pcsp_lo = regs->stacks.pcsp_lo; - g_pcsp_hi = regs->stacks.pcsp_hi; - - raw_all_irq_save(flags); - NATIVE_FLUSHC; - k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - k_pcsp_lo = NATIVE_NV_READ_PCSP_LO_REG(); - BUG_ON(AS(k_pcsp_hi).ind != pcshtp); - - k_crs = (e2k_mem_crs_t *) AS(k_pcsp_lo).base; - g_cframe = (e2k_mem_crs_t __user *) (AS(g_pcsp_lo).base + - AS(g_pcsp_hi).ind - pcshtp); - ret = user_hw_stack_frames_copy(g_cframe, k_crs, pcshtp, regs, - k_pcsp_hi.PCSP_hi_ind, true); - if (ret) { - pr_err("%s(): copy to user stack failed\n", __func__); - BUG_ON(true); - } - k_pcsp_hi.PCSP_hi_ind -= pcshtp; - pcshtp = 0; - regs->stacks.pcshtp = pcshtp; - stacks->pcshtp = pcshtp; - NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); - raw_all_irq_restore(flags); -} -#else /* !EMULATE_EMPTY_CHAIN_STACK */ -static __always_inline void -pv_vcpu_emulate_empty_chain_staks(struct kvm_vcpu *vcpu, pt_regs_t *regs, - e2k_stacks_t *stacks, bool guest_user) -{ -} -#endif /* EMULATE_EMPTY_CHAIN_STACK */ - -/** - * pv_vcpu_user_hw_stacks_copy - check size of user hardware stacks that have - * been SPILLed to kernel back to guest space - * @regs - saved guest user stack registers - * @cur_window_q - size of current window in procedure stack - * - * All guest user's stacks 
part were already copied to guest kernel stacks, - * so it need only check that it was full size and nothing to copy here - */ -static __always_inline int -pv_vcpu_user_hw_stacks_copy(pt_regs_t *regs, e2k_stacks_t *stacks, - u64 cur_window_q, bool guest_user) -{ - e2k_psp_lo_t g_psp_lo = stacks->psp_lo, - k_psp_lo = current_thread_info()->k_psp_lo; - e2k_psp_hi_t g_psp_hi = stacks->psp_hi; - e2k_pcsp_lo_t g_pcsp_lo = stacks->pcsp_lo, - k_pcsp_lo = current_thread_info()->k_pcsp_lo; - e2k_pcsp_hi_t g_pcsp_hi = stacks->pcsp_hi; - s64 g_pshtp_size, g_pcshtp_size, ps_copy_size, pcs_copy_size; - int ret; - - DebugUST("guest kernel chain state: base 0x%llx ind 0x%x size 0x%x\n", - g_pcsp_lo.PCSP_lo_base, g_pcsp_hi.PCSP_hi_ind, - g_pcsp_hi.PCSP_hi_size); - DebugUST("guest kernel proc state: base 0x%llx ind 0x%x size 0x%x\n", - g_psp_lo.PSP_lo_base, g_psp_hi.PSP_hi_ind, - g_psp_hi.PSP_hi_size); - g_pshtp_size = GET_PSHTP_MEM_INDEX(stacks->pshtp); - g_pcshtp_size = PCSHTP_SIGN_EXTEND(stacks->pcshtp); - DebugUST("guest kernel chain stack PCSHTP 0x%llx, " - "proc stack PSHTP 0x%llx cur window 0x%llx\n", - g_pcshtp_size, g_pshtp_size, cur_window_q); - - /* - * FIXME: the current implementation of the guest user signal handler - * injection uses direct copying to guest hardware stacks. - * It is bad decision, needs to be corrected - KVM_BUG_ON(is_paging(current_thread_info()->vcpu) && - (g_psp_lo.PSP_lo_base < GUEST_TASK_SIZE || - g_pcsp_lo.PCSP_lo_base < GUEST_TASK_SIZE)); - */ - - /* - * Calculate size of user's part to copy from kernel stacks - * into guest kernel stacks - */ - pcs_copy_size = get_pcs_copy_size(g_pcshtp_size); - ps_copy_size = get_ps_copy_size(cur_window_q, g_pshtp_size); - /* Make sure there is enough space in CF for the FILL */ - BUG_ON((E2K_MAXCR_q - 4) * 16 < E2K_CF_MAX_FILL); - DebugUST("to copy chain stack 0x%llx, proc stack 0x%llx\n", - pcs_copy_size, ps_copy_size); - - if (likely(pcs_copy_size <= 0 && ps_copy_size <= 0)) - return 0; - - if (unlikely(pcs_copy_size > 0)) { - e2k_pcsp_hi_t k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - void __user *dst; - void *src; - - /* Since SPILL'ed guest user data will be copyed to guest */ - /* kernel stacks then cannot be any overflow of user's */ - /* hardware stack. */ - if (unlikely(AS(g_pcsp_hi).ind > AS(g_pcsp_hi).size)) { - pr_err("%s(): guest kernel chain stack overflow " - "(out of memory?): ind 0x%x size 0x%x\n", - __func__, g_pcsp_hi.PCSP_hi_ind, - g_pcsp_hi.PCSP_hi_size); - KVM_BUG_ON(true); - } - dst = (void __user *)(g_pcsp_lo.PCSP_lo_base + - g_pcsp_hi.PCSP_hi_ind); - if (!guest_user) { - /* stack index has been incremented on PCSHTP */ - dst -= g_pcshtp_size; - } - src = (void *)k_pcsp_lo.PCSP_lo_base; - - ret = user_hw_stack_frames_copy(dst, src, pcs_copy_size, regs, - k_pcsp_hi.PCSP_hi_ind, true); - if (ret) - return ret; - if (guest_user) { - g_pcsp_hi.PCSP_hi_ind += pcs_copy_size; - stacks->pcsp_hi = g_pcsp_hi; - DebugGUST("guest user chain stack frames copied from " - "host %px to guest kernel from %px size 0x%llx " - "PCSP.ind 0x%x\n", - src, dst, pcs_copy_size, g_pcsp_hi.PCSP_hi_ind); - } - } - - if (unlikely(ps_copy_size > 0)) { - e2k_psp_hi_t k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); - void __user *dst; - void *src; - - /* Since SPILL'ed guest user data will be copyed to guest */ - /* kernel stacks then cannot be any overflow of user's */ - /* hardware stack. 
*/ - if (unlikely(AS(g_psp_hi).ind > AS(g_psp_hi).size)) { - pr_err("%s(): guest kernel proc stack overflow " - "(out of memory?): ind 0x%x size 0x%x\n", - __func__, g_psp_hi.PSP_hi_ind, - g_psp_hi.PSP_hi_size); - KVM_BUG_ON(true); - } - dst = (void __user *)(g_psp_lo.PSP_lo_base + - g_psp_hi.PSP_hi_ind); - if (!guest_user) { - /* stack index has been incremented on PSHTP */ - dst -= g_pshtp_size; - } - src = (void *)k_psp_lo.PSP_lo_base; - - ret = user_hw_stack_frames_copy(dst, src, ps_copy_size, regs, - k_psp_hi.PSP_hi_ind, false); - if (ret) - return ret; - if (guest_user) { - g_psp_hi.PSP_hi_ind += ps_copy_size; - stacks->psp_hi = g_psp_hi; - DebugGUST("guest user proc stack frames copied from " - "host %px to guest kernel from %px size 0x%llx " - "PSP.ind 0x%x\n", - src, dst, ps_copy_size, g_psp_hi.PSP_hi_ind); - } - } - - return 0; -} - -/** - * pv_vcpu_user_hw_stacks_prepare - prepare guest user hardware stacks - that have been SPILLed to kernel back - to guest user space - * @regs - saved guest user stack registers - * @cur_window_q - size of current window in procedure stack - * @syscall - true if called upon direct system call exit (no signal handlers) - * - * This does two things: - * - * 1) It is possible that upon kernel entry pcshtp == 0 in some cases: - * - user signal handler had pcshtp==0x20 before return to sigreturn() - * - user context had pcshtp==0x20 before return to makecontext_trampoline() - * - chain stack underflow happened - * So it is possible in sigreturn() and traps, but not in system calls. - * If we are using the trick with return to FILL user hardware stacks than - * we must have frame in chain stack to return to. So in this case kernel's - * chain stack is moved up by one frame (0x20 bytes). - * We also fill the new frame with actual user data and update stacks->pcshtp, - * this is needed to keep the coherent state where saved stacks->pcshtp values - * shows how much data from user space has been spilled to kernel space. - * - * 2) It is not possible to always FILL all of user data that have been - * SPILLed to kernel stacks. So we manually copy the leftovers that can - * not be FILLed to user space. - * This copy does not update stacks->pshtp and stacks->pcshtp. Main reason - * is signals: if a signal arrives after copying then it must see a coherent - * state where saved stacks->pshtp and stacks->pcshtp values show how much - * data from user space has been spilled to kernel space. 
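Point 1 of this description corresponds to the code a few lines below: when pcshtp == 0 outside a system call, the kernel chain stack index is raised by SZ_OF_CR and the guest's top chain frame is copied into the reserved slot, so the FILL-return trick has a frame to consume. A compact model of that reservation, using plain buffers instead of the PCSP registers; illustrative only.

	#include <stdio.h>
	#include <string.h>

	#define SZ_OF_CR 0x20UL

	struct chain_stack {
		unsigned char base[0x400];
		unsigned long ind;
	};

	/* model of the pcshtp == 0 case: reserve one bottom frame in the
	 * kernel chain stack and fill it from the top of the user stack */
	static int reserve_bottom_frame(struct chain_stack *k_pcs,
					const struct chain_stack *g_pcs)
	{
		if (g_pcs->ind < SZ_OF_CR)
			return -1;	/* guest chain stack underflow */
		k_pcs->ind += SZ_OF_CR;	/* like AS(k_pcsp_hi).ind += SZ_OF_CR */
		memcpy(k_pcs->base, g_pcs->base + g_pcs->ind - SZ_OF_CR,
		       SZ_OF_CR);
		return 0;		/* caller then sets pcshtp = SZ_OF_CR */
	}

	int main(void)
	{
		static struct chain_stack k, g = { .ind = 3 * SZ_OF_CR };
		int ret = reserve_bottom_frame(&k, &g);

		printf("reserve: %d, kernel ind 0x%lx\n", ret, k.ind);
		return 0;
	}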
- */ -static __always_inline void -pv_vcpu_user_hw_stacks_prepare(struct kvm_vcpu *vcpu, pt_regs_t *regs, - u64 cur_window_q, enum restore_caller from, int syscall) -{ - e2k_stacks_t *stacks; - e2k_pcshtp_t pcshtp; - bool guest_user; - bool paging = is_paging(vcpu); - int ret; - - if (likely(paging)) { - guest_user = !!(syscall || !pv_vcpu_trap_on_guest_kernel(regs)); - } else { - guest_user = false; - } - if (guest_user) { - if (from & FROM_PV_VCPU_MODE) { - /* all preparation has been made */ - /* by host & guest handler */ - return; - } - - /* trap on/syscall from guest user, so regs keeps user */ - /* registers state and it need use guest kernel stacks */ - /* in empty state to handle this trap/syscall */ - if (!regs->g_stacks_valid) { - prepare_pv_vcpu_inject_stacks(vcpu, regs); - } - stacks = ®s->g_stacks; - } else { - /* trap on guest kernel, so regs already points to guest */ - /* kernel stacks and trap will be handled by host */ - /* same as other user's processes traps */ - stacks = ®s->stacks; - } - - /* only to debug on simulator : pcshtp == 0 */ - pv_vcpu_emulate_empty_chain_staks(vcpu, regs, stacks, guest_user); - - pcshtp = stacks->pcshtp; - DebugUST("guest kernel chain stack state: base 0x%llx ind 0x%x " - "size 0x%x\n", - stacks->pcsp_lo.PCSP_lo_base, - stacks->pcsp_hi.PCSP_hi_ind, - stacks->pcsp_hi.PCSP_hi_size); - DebugUST("host kernel chain stack state: base 0x%llx ind 0x%x " - "size 0x%x\n", - NATIVE_NV_READ_PCSP_LO_REG().PCSP_lo_base, - NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_ind, - NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_size); - DebugUST("guest kernel chain stack size to fill PCSHTP 0x%x\n", - pcshtp); - /* - * 1) Make sure there is free space in kernel chain stack to return to - */ - if (!syscall && pcshtp == 0 && !guest_user) { - unsigned long flags; - e2k_pcsp_lo_t g_pcsp_lo = stacks->pcsp_lo, - k_pcsp_lo = current_thread_info()->k_pcsp_lo; - e2k_pcsp_hi_t g_pcsp_hi = stacks->pcsp_hi, k_pcsp_hi; - e2k_mem_crs_t __user *g_cframe; - e2k_mem_crs_t *k_crs; - int ret = -EINVAL; - - raw_all_irq_save(flags); - NATIVE_FLUSHC; - k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - BUG_ON(AS(k_pcsp_hi).ind); - AS(k_pcsp_hi).ind += SZ_OF_CR; - NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); - - k_crs = (e2k_mem_crs_t *) AS(k_pcsp_lo).base; - g_cframe = (e2k_mem_crs_t __user *) (AS(g_pcsp_lo).base + - AS(g_pcsp_hi).ind); - if ((u64) g_cframe > (u64) AS(g_pcsp_lo).base) { - ret = __copy_user_to_current_hw_stack(k_crs, - g_cframe - 1, sizeof(*k_crs), regs, true); - } - raw_all_irq_restore(flags); - - /* Can happen if application returns until runs out of - * chain stack or there is no free memory for stacks. - * There is no user stack to return to - die. */ - if (ret) { - E2K_LMS_HALT_OK; - pr_err("%s(): SIGKILL. %s\n", - __func__, - (ret == -EINVAL) ? 
- "tried to return to kernel" - : - "ran into Out-of-Memory on user stacks"); - force_sig(SIGKILL); - return; - } - DebugUST("copy guest user chain frame from %px to kernel " - "bottom from %px\n", - g_cframe - 1, k_crs); - - if (AS(g_pcsp_hi).ind < SZ_OF_CR) { - pr_err("%s(): guest kernel chain stack underflow\n", - __func__); - KVM_BUG_ON(true); - } - - pcshtp = SZ_OF_CR; - stacks->pcshtp = pcshtp; - DebugUST("guest kernel chain stack to FILL PCSHTP " - "set to 0x%x\n", - stacks->pcshtp); - } else if (!syscall && pcshtp == 0 && guest_user) { - e2k_pcsp_hi_t k_pcsp_hi; - unsigned long flags; - - /* set flag for unconditional injection to do not copy */ - /* from guest user space */ - regs->need_inject = true; - - /* reserve one bottom frames for trampoline */ - /* the guest handler replaces guest user trapped frame */ - raw_all_irq_save(flags); - NATIVE_FLUSHC; - k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - BUG_ON(k_pcsp_hi.PCSP_hi_ind); - k_pcsp_hi.PCSP_hi_ind += 1 * SZ_OF_CR; - NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); - raw_all_irq_restore(flags); - } - - /* - * 2) Copy user data that cannot be FILLed - */ - ret = pv_vcpu_user_hw_stacks_copy(regs, stacks, cur_window_q, - guest_user); - if (unlikely(ret)) - do_exit(SIGKILL); -} - -/* Same as for native kernel without virtualization support */ -static __always_inline int -user_hw_stacks_copy(struct e2k_stacks *stacks, - pt_regs_t *regs, u64 cur_window_q, bool copy_full) -{ - return native_user_hw_stacks_copy(stacks, regs, cur_window_q, copy_full); -} - -static __always_inline void -host_user_hw_stacks_prepare(struct e2k_stacks *stacks, pt_regs_t *regs, - u64 cur_window_q, enum restore_caller from, int syscall) -{ - struct kvm_vcpu *vcpu; - - if (likely(!kvm_test_intc_emul_flag(regs))) { - /* trap on/syscall from host user processes */ - return native_user_hw_stacks_prepare(stacks, regs, - cur_window_q, from, syscall); - } - - vcpu = current_thread_info()->vcpu; - KVM_BUG_ON(vcpu == NULL); - pv_vcpu_user_hw_stacks_prepare(vcpu, regs, cur_window_q, from, syscall); -} - static __always_inline void host_exit_to_usermode_loop(struct pt_regs *regs, bool syscall, bool has_signal) { diff --git a/arch/e2k/include/asm/kvm/switch.h b/arch/e2k/include/asm/kvm/switch.h index 7f39f8b..d845c02 100644 --- a/arch/e2k/include/asm/kvm/switch.h +++ b/arch/e2k/include/asm/kvm/switch.h @@ -101,8 +101,7 @@ native_guest_syscall_enter(struct pt_regs *regs) #ifdef CONFIG_VIRTUALIZATION /* - * For interceptions just switch actual registers with saved values - * in 'sw_ctxt'. + * For interceptions just switch actual registers with saved values in 'sw_ctxt'. * * For hypercalls: * 1) Enter hypercall. 
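The switch.h hunk that follows splits the MMU PID switch into two explicit steps: look up the per-CPU context mask via get_mmu_pid() (guest gmm) or get_mmu_context() (host mm), then install it with reload_context_mask(). A toy model of that two-step split; the mask layout and the single get_mmu_context() helper below are invented for illustration and stand in for both lookup helpers.

	#include <stdio.h>

	#define PID_BITS	12UL	/* invented layout, not the real e2k one */
	#define PID_MASK	((1UL << PID_BITS) - 1)

	struct toy_mm_context {
		unsigned long cpumsk[4];	/* per-CPU version+PID, like cpumsk[] */
	};

	static unsigned long hw_context;	/* stands in for the MMU PID register */

	/* step 1: look up (or refresh) the context mask for this CPU */
	static unsigned long get_mmu_context(struct toy_mm_context *ctx, int cpu)
	{
		if ((ctx->cpumsk[cpu] & PID_MASK) == 0)
			ctx->cpumsk[cpu] = (1UL << PID_BITS) | 1; /* new version+PID */
		return ctx->cpumsk[cpu];
	}

	/* step 2: install it into the hardware register */
	static void reload_context_mask(unsigned long mask)
	{
		hw_context = mask & PID_MASK;
	}

	int main(void)
	{
		struct toy_mm_context gmm = { { 0 } };
		unsigned long mask = get_mmu_context(&gmm, 0);

		reload_context_mask(mask);
		printf("hw PID 0x%lx\n", hw_context);
		return 0;
	}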
@@ -128,8 +127,7 @@ static inline void kvm_switch_stack_regs(struct kvm_sw_cpu_context *sw_ctxt, AW(sbr) = NATIVE_NV_READ_SBR_REG_VALUE(); } - NATIVE_NV_WRITE_USBR_USD_REG(sw_ctxt->sbr, - sw_ctxt->usd_hi, sw_ctxt->usd_lo); + NATIVE_NV_WRITE_USBR_USD_REG(sw_ctxt->sbr, sw_ctxt->usd_hi, sw_ctxt->usd_lo); if (ctxt_save) { KVM_BUG_ON(sw_ctxt->saved.valid); @@ -225,14 +223,27 @@ static inline void kvm_switch_mmu_regs(struct kvm_sw_cpu_context *sw_ctxt, static inline void kvm_switch_to_guest_mmu_pid(struct kvm_vcpu *vcpu) { mm_context_t *gmm_context; + unsigned long mask; gmm_context = pv_vcpu_get_gmm_context(vcpu); - reload_mmu_pid(gmm_context, smp_processor_id()); + mask = get_mmu_pid(gmm_context, smp_processor_id()); + reload_context_mask(mask); +} + +static inline unsigned long kvm_get_guest_mmu_pid(struct kvm_vcpu *vcpu) +{ + mm_context_t *gmm_context; + + gmm_context = pv_vcpu_get_gmm_context(vcpu); + return gmm_context->cpumsk[smp_processor_id()]; } static inline void kvm_switch_to_host_mmu_pid(struct mm_struct *mm) { - reload_context(mm, smp_processor_id()); + unsigned long mask; + + mask = get_mmu_context(mm, smp_processor_id()); + reload_context_mask(mask); } static inline void kvm_switch_debug_regs(struct kvm_sw_cpu_context *sw_ctxt, @@ -479,6 +490,8 @@ static inline void host_guest_enter(struct thread_info *ti, if (flags & DEBUG_REGS_SWITCH) kvm_switch_debug_regs(sw_ctxt, true); + KVM_BUG_ON(vcpu->is_hv && !NATIVE_READ_MMU_US_CL_D()); + /* Switch data stack after all function calls */ if (flags & USD_CONTEXT_SWITCH) { if (!(flags & FROM_HYPERCALL_SWITCH) || !vcpu->is_hv) { @@ -506,6 +519,8 @@ static inline void host_guest_enter_light(struct thread_info *ti, kvm_switch_cu_regs(sw_ctxt); + KVM_BUG_ON(vcpu->is_hv && !NATIVE_READ_MMU_US_CL_D()); + /* Switch data stack after all function calls */ if (!from_sdisp) { if (!vcpu->is_hv) { @@ -513,6 +528,7 @@ static inline void host_guest_enter_light(struct thread_info *ti, } else { /* restore saved source pointers of host stack */ kvm_switch_stack_regs(sw_ctxt, false, true); + kvm_switch_clw_regs(sw_ctxt, true); } } } @@ -540,6 +556,8 @@ static inline void host_guest_exit(struct thread_info *ti, } } + KVM_BUG_ON(vcpu->is_hv && !NATIVE_READ_MMU_US_CL_D()); + if (flags & FROM_HYPERCALL_SWITCH) { /* * Hypercalls - both hardware and software virtualization @@ -649,6 +667,8 @@ static inline void host_guest_exit_light(struct thread_info *ti, KVM_BUG_ON(sw_ctxt->in_hypercall); sw_ctxt->in_hypercall = true; + KVM_BUG_ON(vcpu->is_hv && !NATIVE_READ_MMU_US_CL_D()); + HOST_SAVE_KERNEL_GREGS_AS_LIGHT(ti); ONLY_SET_KERNEL_GREGS(ti); diff --git a/arch/e2k/include/asm/kvm/trace-defs.h b/arch/e2k/include/asm/kvm/trace-defs.h new file mode 100644 index 0000000..492280b --- /dev/null +++ b/arch/e2k/include/asm/kvm/trace-defs.h @@ -0,0 +1,23 @@ +#ifndef _E2K_KVM_TRACE_DEFS_H_ +#define _E2K_KVM_TRACE_DEFS_H_ + +#include + +#include +#include +#include + +static inline void +trace_kvm_get_va_translation(struct kvm_vcpu *vcpu, e2k_addr_t address, + pgdval_t *pgd, pudval_t *pud, pmdval_t *pmd, pteval_t *pte, int *pt_level) +{ + kvm_get_spt_translation(vcpu, address, pgd, pud, pmd, pte, pt_level); +} + +static inline unsigned long +trace_kvm_get_gva_to_hva(struct kvm_vcpu *vcpu, gva_t gva) +{ + return kvm_get_gva_to_hva(vcpu, gva); +} + +#endif /* _E2K_KVM_TRACE_DEFS_H_ */ diff --git a/arch/e2k/include/asm/kvm/trace-hw-stacks.h b/arch/e2k/include/asm/kvm/trace-hw-stacks.h new file mode 100644 index 0000000..8c8db3f --- /dev/null +++ 
b/arch/e2k/include/asm/kvm/trace-hw-stacks.h @@ -0,0 +1,367 @@ +#if !defined(_KVM_TRACE_COPY_HW_STACKS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _KVM_TRACE_COPY_HW_STACKS_H + +#include +#include + +#include +#include +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM host + +#ifdef CONFIG_KVM_HOST_MODE + +TRACE_EVENT( + host_copy_hw_stack, + + TP_PROTO(void *dst, void *src, unsigned long size, bool is_chain), + + TP_ARGS(dst, src, size, is_chain), + + TP_STRUCT__entry( + __field_struct(struct kvm_vcpu *, vcpu ) + __field( void *, dst ) + __field( void *, src ) + __field( u64, size ) + __field( bool, is_chain ) + __field( pgdval_t, dst_pgd ) + __field( pudval_t, dst_pud ) + __field( pmdval_t, dst_pmd ) + __field( pteval_t, dst_pte ) + __field( int, dst_pt_level ) + __field( pgdval_t, hva_pgd ) + __field( pudval_t, hva_pud ) + __field( pmdval_t, hva_pmd ) + __field( pteval_t, hva_pte ) + __field( int, hva_pt_level ) + __field( unsigned long, hva ) + ), + + TP_fast_assign( + unsigned long hva; + + __entry->vcpu = current_thread_info()->vcpu; + __entry->dst = dst; + __entry->src = src; + __entry->size = size; + __entry->is_chain = is_chain; + + trace_kvm_get_va_translation(__entry->vcpu, (e2k_addr_t)dst, + &__entry->dst_pgd, &__entry->dst_pud, &__entry->dst_pmd, + &__entry->dst_pte, &__entry->dst_pt_level); + + hva = trace_kvm_get_gva_to_hva(__entry->vcpu, (gva_t)dst); + __entry->hva = hva; + + trace_get_va_translation(current->mm, (e2k_addr_t)hva, + &__entry->hva_pgd, &__entry->hva_pud, &__entry->hva_pmd, + &__entry->hva_pte, &__entry->hva_pt_level); + ), + + TP_printk("VCPU #%d copy %s stack kernel guest <- kernel host: dst %px " + "src %px size %llx\n" + " kernel guest dst GVA %px : pgd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pud 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pmd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pte 0x%016lx : %s\n" + " Access mode: %s%s\n" + " kernel host dst HVA %px : pgd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pud 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pmd 0x%016lx : %s\n" + " Access mode: %s%s\n" + " pte 0x%016lx : %s\n" + " Access mode: %s%s", + __entry->vcpu->vcpu_id, + (__entry->is_chain) ? "chain" : "procedure", + __entry->dst, + __entry->src, + __entry->size, + __entry->dst, + (__entry->dst_pt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->dst_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pgd, + __entry->dst_pt_level <= E2K_PGD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->dst_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pud, + __entry->dst_pt_level <= E2K_PUD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PMD_LEVEL_NUM) ? + __entry->dst_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pmd, + __entry->dst_pt_level <= E2K_PMD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->dst_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pte, + __entry->dst_pt_level <= E2K_PTE_LEVEL_NUM), + (void *)__entry->hva, + (__entry->hva_pt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->hva_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->hva_pgd, + __entry->hva_pt_level <= E2K_PGD_LEVEL_NUM), + (__entry->hva_pt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->hva_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->hva_pud, + __entry->hva_pt_level <= E2K_PUD_LEVEL_NUM), + (__entry->hva_pt_level <= E2K_PMD_LEVEL_NUM) ? + __entry->hva_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->hva_pmd, + __entry->hva_pt_level <= E2K_PMD_LEVEL_NUM), + (__entry->hva_pt_level <= E2K_PTE_LEVEL_NUM) ? 
+ __entry->hva_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->hva_pte, + __entry->hva_pt_level <= E2K_PTE_LEVEL_NUM) + ) +); + +TRACE_EVENT( + host_proc_stack_frame, + + TP_PROTO(kernel_mem_ps_t *ps_base, kernel_mem_ps_t *ps_frame), + + TP_ARGS(ps_base, ps_frame), + + TP_STRUCT__entry( + __field( kernel_mem_ps_t *, ps_base ) + __field_struct( kernel_mem_ps_t, ps_frame ) + __field( pgprotval_t, dtlb_entry ) + ), + + TP_fast_assign( + __entry->ps_base = ps_base; + __entry->ps_frame = *ps_frame; + __entry->dtlb_entry = get_MMU_DTLB_ENTRY((e2k_addr_t)ps_base); + ), + + TP_printk(" %px (dtlb 0x%016lx) : 0x%016lx 0x%016lx", + __entry->ps_base, __entry->dtlb_entry, + __entry->ps_frame.word_lo, __entry->ps_frame.word_hi) +); + +TRACE_EVENT( + host_chain_stack_frame, + + TP_PROTO(e2k_mem_crs_t *pcs_base, e2k_mem_crs_t *pcs_frame), + + TP_ARGS(pcs_base, pcs_frame), + + TP_STRUCT__entry( + __field( e2k_mem_crs_t *, pcs_base ) + __field_struct( e2k_mem_crs_t, pcs_frame ) + __field( pgprotval_t, dtlb_entry ) + ), + + TP_fast_assign( + __entry->pcs_base = pcs_base; + __entry->pcs_frame = *pcs_frame; + __entry->dtlb_entry = get_MMU_DTLB_ENTRY((e2k_addr_t)pcs_base); + ), + + TP_printk(" %px (dtlb 0x%016lx) : 0x%016llx 0x%016llx " + "0x%016llx 0x%016llx", + __entry->pcs_base, __entry->dtlb_entry, + __entry->pcs_frame.cr0_lo.CR0_lo_half, + __entry->pcs_frame.cr0_hi.CR0_hi_half, + __entry->pcs_frame.cr1_lo.CR1_lo_half, + __entry->pcs_frame.cr1_hi.CR1_hi_half) +); + +TRACE_EVENT( + host_copy_hva_area, + + TP_PROTO(void *dst, void *src, unsigned long size), + + TP_ARGS(dst, src, size), + + TP_STRUCT__entry( + __field( void *, dst ) + __field( void *, src ) + __field( u64, size ) + __field( pgdval_t, dst_pgd ) + __field( pudval_t, dst_pud ) + __field( pmdval_t, dst_pmd ) + __field( pteval_t, dst_pte ) + __field( int, dst_pt_level ) + __field( pgdval_t, src_pgd ) + __field( pudval_t, src_pud ) + __field( pmdval_t, src_pmd ) + __field( pteval_t, src_pte ) + __field( int, src_pt_level ) + ), + + TP_fast_assign( + __entry->dst = dst; + __entry->src = src; + __entry->size = size; + + trace_get_va_translation(current->mm, (e2k_addr_t)dst, + &__entry->dst_pgd, &__entry->dst_pud, &__entry->dst_pmd, + &__entry->dst_pte, &__entry->dst_pt_level); + trace_get_va_translation(current->mm, (e2k_addr_t)src, + &__entry->src_pgd, &__entry->src_pud, &__entry->src_pmd, + &__entry->src_pte, &__entry->src_pt_level); + ), + + TP_printk("copy area user guest <- kernel guest: dst %px " "src %px size %llx\n" " kernel guest dst HVA %px : pgd 0x%016lx : %s\n" " Access mode: %s%s\n" " pud 0x%016lx : %s\n" " Access mode: %s%s\n" " pmd 0x%016lx : %s\n" " Access mode: %s%s\n" " pte 0x%016lx : %s\n" " Access mode: %s%s\n" " kernel guest src HVA %px : pgd 0x%016lx : %s\n" " Access mode: %s%s\n" " pud 0x%016lx : %s\n" " Access mode: %s%s\n" " pmd 0x%016lx : %s\n" " Access mode: %s%s\n" " pte 0x%016lx : %s\n" " Access mode: %s%s", + __entry->dst, + __entry->src, + __entry->size, + __entry->dst, + (__entry->dst_pt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->dst_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pgd, + __entry->dst_pt_level <= E2K_PGD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->dst_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pud, + __entry->dst_pt_level <= E2K_PUD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PMD_LEVEL_NUM) ?
+ __entry->dst_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pmd, + __entry->dst_pt_level <= E2K_PMD_LEVEL_NUM), + (__entry->dst_pt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->dst_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->dst_pte, + __entry->dst_pt_level <= E2K_PTE_LEVEL_NUM), + __entry->src, + (__entry->src_pt_level <= E2K_PGD_LEVEL_NUM) ? + __entry->src_pgd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pgd, + __entry->src_pt_level <= E2K_PGD_LEVEL_NUM), + (__entry->src_pt_level <= E2K_PUD_LEVEL_NUM) ? + __entry->src_pud : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pud, + __entry->src_pt_level <= E2K_PUD_LEVEL_NUM), + (__entry->src_pt_level <= E2K_PMD_LEVEL_NUM) ? + __entry->src_pmd : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pmd, + __entry->src_pt_level <= E2K_PMD_LEVEL_NUM), + (__entry->src_pt_level <= E2K_PTE_LEVEL_NUM) ? + __entry->src_pte : -1UL, + E2K_TRACE_PRINT_PT_FLAGS(__entry->src_pte, + __entry->src_pt_level <= E2K_PTE_LEVEL_NUM) + ) +); + +TRACE_EVENT( + host_hva_area_line, + + TP_PROTO(u64 *hva_base, u64 size), + + TP_ARGS(hva_base, size), + + TP_STRUCT__entry( + __field( u64 *, hva_base ) + __array( u64, hva_line, 4 ) + __field( u64, size ) + __field( pgprotval_t, dtlb_entry ) + ), + + TP_fast_assign( + __entry->hva_base = hva_base; + __entry->hva_line[0] = + (size >= 1 * sizeof(u64)) ? hva_base[0] : -1; + __entry->hva_line[1] = + (size >= 2 * sizeof(u64)) ? hva_base[1] : -1; + __entry->hva_line[2] = + (size >= 3 * sizeof(u64)) ? hva_base[2] : -1; + __entry->hva_line[3] = + (size >= 4 * sizeof(u64)) ? hva_base[3] : -1; + __entry->size = size; + __entry->dtlb_entry = get_MMU_DTLB_ENTRY((e2k_addr_t)hva_base); + ), + + TP_printk(" %px (dtlb 0x%016lx) : 0x%016llx 0x%016llx " + "0x%016llx 0x%016llx", + __entry->hva_base, __entry->dtlb_entry, + __entry->hva_line[0], + __entry->hva_line[1], + __entry->hva_line[2], + __entry->hva_line[3] + ) +); +#else /* !CONFIG_KVM_HOST_MODE */ + +static inline bool trace_host_copy_hw_stack_enabled(void) +{ + return false; +} +static inline void +trace_host_copy_hw_stack(void *dst, void *src, unsigned long size, bool is_chain) +{ +} + +static inline bool trace_host_proc_stack_frame_enabled(void) +{ + return false; +} +static inline void +trace_host_proc_stack_frame(kernel_mem_ps_t *ps_base, kernel_mem_ps_t *ps_frame) +{ +} + +static inline bool trace_host_chain_stack_frame_enabled(void) +{ + return false; +} +static inline void +trace_host_chain_stack_frame(e2k_mem_crs_t *pcs_base, e2k_mem_crs_t *pcs_frame) +{ +} + +static inline bool trace_host_copy_hva_area_enabled(void) +{ + return false; +} +static inline void +trace_host_copy_hva_area(void *dst, void *src, unsigned long size) +{ +} + +static inline bool trace_host_hva_area_line_enabled(void) +{ + return false; +} +static inline void +trace_host_hva_area_line(u64 *hva_base, u64 size) +{ +} + +#endif /* CONFIG_KVM_HOST_MODE */ + +#endif /* _KVM_TRACE_COPY_HW_STACKS_H */ + +#ifdef CONFIG_KVM_HOST_MODE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/e2k/include/asm/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace-hw-stacks + +/* This part must be outside protection */ +#include +#endif /* CONFIG_KVM_HOST_MODE */ diff --git a/arch/e2k/include/asm/kvm/trace_kvm.h b/arch/e2k/include/asm/kvm/trace_kvm.h index a877257..1f116ca 100644 --- a/arch/e2k/include/asm/kvm/trace_kvm.h +++ b/arch/e2k/include/asm/kvm/trace_kvm.h @@ -1090,6 +1090,79 @@ TRACE_EVENT( TP_printk("Light hypercall exit: %llu\n", __entry->ret) ); +TRACE_EVENT( + guest_switch_to, + + 
TP_PROTO(struct kvm_vcpu *vcpu, int gpid_from, int gmmid_from, + int gpid_to, int gmmid_to, struct sw_regs *next_gsw), + + TP_ARGS(vcpu, gpid_from, gmmid_from, gpid_to, gmmid_to, next_gsw), + + TP_STRUCT__entry( + __field( int, vcpu_id ) + __field( int, gpid_from ) + __field( int, gmmid_from ) + __field( int, gpid_to ) + __field( int, gmmid_to ) + __field( e2k_addr_t, top ) + __field_struct( e2k_usd_lo_t, usd_lo ) + __field_struct( e2k_usd_hi_t, usd_hi ) + __field_struct( e2k_psp_lo_t, psp_lo ) + __field_struct( e2k_psp_hi_t, psp_hi ) + __field_struct( e2k_pcsp_lo_t, pcsp_lo ) + __field_struct( e2k_pcsp_hi_t, pcsp_hi ) + __field( pgprotval_t, u_pptb ) + __field( gva_t, u_vptb ) + __field( hpa_t, root ) + __field( u64, mmu_pptb ) + __field( u64, mmu_pid ) + __field( u64, ctxt_pid ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpid_from = gpid_from; + __entry->gmmid_from = gmmid_from; + __entry->gpid_to = gpid_to; + __entry->gmmid_to = gmmid_to; + __entry->top = next_gsw->top; + __entry->usd_lo = next_gsw->usd_lo; + __entry->usd_hi = next_gsw->usd_hi; + __entry->psp_lo = next_gsw->psp_lo; + __entry->psp_hi = next_gsw->psp_hi; + __entry->pcsp_lo = next_gsw->pcsp_lo; + __entry->pcsp_hi = next_gsw->pcsp_hi; + __entry->u_pptb = vcpu->arch.mmu.get_vcpu_u_pptb(vcpu); + __entry->u_vptb = vcpu->arch.mmu.get_vcpu_sh_u_vptb(vcpu); + __entry->root = kvm_get_space_type_spt_u_root(vcpu); + __entry->mmu_pptb = get_mmu_u_pptb_reg(); + __entry->mmu_pid = get_mmu_pid_reg(); + __entry->ctxt_pid = kvm_get_guest_mmu_pid(vcpu); + ), + + TP_printk("VCPU #%d: switch from gpid #%d gmm #%d to gpid #%d gmm #%d\n" + " USD: base 0x%llx size 0x%x top at 0x%lx\n" + " PSP: base 0x%llx ind 0x%x size 0x%x\n" + " PCSP: base 0x%llx ind 0x%x size 0x%x\n" + " MMU: u_pptb 0x%lx u_vptb 0x%lx sh_pptb 0x%llx\n" + " mmu_upptb 0x%llx mmu_pid 0x%llx ctxt pid 0x%llx", + __entry->vcpu_id, + __entry->gpid_from, __entry->gmmid_from, + __entry->gpid_to, __entry->gmmid_to, + __entry->usd_lo.USD_lo_base, + __entry->usd_hi.USD_hi_size, + __entry->top, + __entry->psp_lo.PSP_lo_base, + __entry->psp_hi.PSP_hi_ind, + __entry->psp_hi.PSP_hi_size, + __entry->pcsp_lo.PCSP_lo_base, + __entry->pcsp_hi.PCSP_hi_ind, + __entry->pcsp_hi.PCSP_hi_size, + __entry->u_pptb, __entry->u_vptb, __entry->root, + __entry->mmu_pptb, __entry->mmu_pid, __entry->ctxt_pid + ) +); + TRACE_EVENT( vcpu_put, diff --git a/arch/e2k/include/asm/kvm/trace_kvm_hv.h b/arch/e2k/include/asm/kvm/trace_kvm_hv.h index b617c13..0a3de33 100644 --- a/arch/e2k/include/asm/kvm/trace_kvm_hv.h +++ b/arch/e2k/include/asm/kvm/trace_kvm_hv.h @@ -368,6 +368,42 @@ TRACE_EVENT( TP_printk("pmirr#%d val 0x%llx\n", __entry->pmirr, __entry->val) ); +TRACE_EVENT( + save_pnmirr, + + TP_PROTO(u32 val), + + TP_ARGS(val), + + TP_STRUCT__entry( + __field( u32, val ) + ), + + TP_fast_assign( + __entry->val = val; + ), + + TP_printk("pnmirr val 0x%x\n", __entry->val) +); + +TRACE_EVENT( + restore_pnmirr, + + TP_PROTO(u32 val), + + TP_ARGS(val), + + TP_STRUCT__entry( + __field( u32, val ) + ), + + TP_fast_assign( + __entry->val = val; + ), + + TP_printk("pnmirr val 0x%x\n", __entry->val) +); + TRACE_EVENT( save_cir, diff --git a/arch/e2k/include/asm/kvm_host.h b/arch/e2k/include/asm/kvm_host.h index b057001..28d5489 100644 --- a/arch/e2k/include/asm/kvm_host.h +++ b/arch/e2k/include/asm/kvm_host.h @@ -744,6 +744,7 @@ typedef struct kvm_sw_cpu_context { u64 rpr_hi; u64 tcd; +#ifdef CONFIG_CLW_ENABLE mmu_reg_t us_cl_d; clw_reg_t us_cl_b; clw_reg_t us_cl_up; @@ -751,6 +752,7 @@ typedef 
struct kvm_sw_cpu_context { clw_reg_t us_cl_m1; clw_reg_t us_cl_m2; clw_reg_t us_cl_m3; +#endif } kvm_sw_cpu_context_t; /* diff --git a/arch/e2k/include/asm/machdep.h b/arch/e2k/include/asm/machdep.h index 61bb325..100441d 100644 --- a/arch/e2k/include/asm/machdep.h +++ b/arch/e2k/include/asm/machdep.h @@ -39,10 +39,13 @@ enum { CPU_HWBUG_SPURIOUS_EXC_DATA_DEBUG, CPU_HWBUG_TLB_FLUSH_L1D, CPU_HWBUG_GUEST_ASYNC_PM, + CPU_HWBUG_E16C_SLEEP, CPU_HWBUG_L1I_STOPS_WORKING, CPU_HWBUG_CLW_STALE_L1_ENTRY, CPU_HWBUG_C3_WAIT_MA_C, CPU_HWBUG_VIRT_SCLKM3_INTC, + CPU_HWBUG_VIRT_PSIZE_INTERCEPTION, + CPU_HWBUG_USD_ALIGNMENT, CPU_FEAT_WC_PCI_PREFETCH, CPU_FEAT_FLUSH_DC_IC, CPU_FEAT_EPIC, @@ -52,7 +55,6 @@ enum { CPU_FEAT_ISET_V3, CPU_FEAT_ISET_V5, CPU_FEAT_ISET_V6, - CPU_HWBUG_E16C_SLEEP, NR_CPU_FEATURES }; @@ -298,7 +300,8 @@ CPUHAS(CPU_HWBUG_DMA_AT_APIC_ADDR, false, cpu == IDR_ES2_DSP_MDL); /* #88644 - data profiling events are lost if overflow happens - * under closed NM interrupts. + * under closed NM interrupts; also DDMCR writing does not clear + * pending exc_data_debug exceptions. * Workaround - disable data monitor profiling in kernel. */ CPUHAS(CPU_HWBUG_KERNEL_DATA_MONITOR, !IS_ENABLED(CONFIG_CPU_ES2) && !IS_ENABLED(CONFIG_CPU_E2S) && @@ -491,6 +494,22 @@ CPUHAS(CPU_HWBUG_VIRT_SCLKM3_INTC, cpu == IDR_E16C_MDL && revision == 0 || cpu == IDR_E12C_MDL && revision == 0 || cpu == IDR_E2C3_MDL && revision == 0); +/* #130039 - intercepting some specific sequences of call/return/setwd + * (that change WD.psize in a specific way) does not work. + * Workaround - avoid those sequences. */ +CPUHAS(CPU_HWBUG_VIRT_PSIZE_INTERCEPTION, + IS_ENABLED(CONFIG_E2K_MACHINE), + IS_ENABLED(CONFIG_CPU_E16C) || IS_ENABLED(CONFIG_CPU_E2C3), + (cpu == IDR_E16C_MDL || cpu == IDR_E2C3_MDL) && revision == 0); +/* #129848 - alignment of usd_hi write depends on current usd_lo.p + * Workaround - write usd_lo before usd_hi, while keeping a 2-cycle distance from the sbr write. + * Valid sequences are: sbr, nop, usd.lo, usd.hi OR sbr, usd.lo, usd.hi, usd.lo */ +CPUHAS(CPU_HWBUG_USD_ALIGNMENT, + IS_ENABLED(CONFIG_E2K_MACHINE) && !IS_ENABLED(CONFIG_CPU_E16C) && + !IS_ENABLED(CONFIG_CPU_E2C3), + !IS_ENABLED(CONFIG_CPU_E12C), + cpu == IDR_E16C_MDL && revision <= 1 || + cpu == IDR_E2C3_MDL && revision <= 1); /* Rely on IDR instead of iset version to choose between APIC and EPIC. * For guest we use it's own fake IDR so that we choose between APIC and * EPIC based on what hardware guest *thinks* it's being executed on.
*/ diff --git a/arch/e2k/include/asm/mmu_regs_types.h b/arch/e2k/include/asm/mmu_regs_types.h index 7dcf688..85e1ca7 100644 --- a/arch/e2k/include/asm/mmu_regs_types.h +++ b/arch/e2k/include/asm/mmu_regs_types.h @@ -733,6 +733,7 @@ typedef unsigned long clw_reg_t; #ifndef __ASSEMBLY__ typedef union { + u32 half_word[2]; struct { /* structure of register */ u32 user : 1; /* [ 0: 0] */ u32 system : 1; /* [ 1: 1] */ @@ -755,7 +756,10 @@ typedef union { u64 unus9 : 4; u64 b3 : 8; u64 unu10 : 4; - u64 unu11 : 16; + u64 unu11 : 1; + u64 m0 : 1; + u64 m1 : 1; + u64 unu12 : 13; }; union { struct { diff --git a/arch/e2k/include/asm/mmu_types.h b/arch/e2k/include/asm/mmu_types.h index d85b8f0..c991e50 100644 --- a/arch/e2k/include/asm/mmu_types.h +++ b/arch/e2k/include/asm/mmu_types.h @@ -29,8 +29,8 @@ typedef struct { pgprotval_t pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pte(x) ((pte_t) { (x) } ) -#define __pmd(x) ((pmd_t) { (x) } ) #define __pud(x) ((pud_t) { (x) } ) +#define __pmd(x) ((pmd_t) { (x) } ) #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/arch/e2k/include/asm/native_cpu_regs_access.h b/arch/e2k/include/asm/native_cpu_regs_access.h index 1146f39..634ddeb 100644 --- a/arch/e2k/include/asm/native_cpu_regs_access.h +++ b/arch/e2k/include/asm/native_cpu_regs_access.h @@ -246,19 +246,12 @@ native_read_TIR_hi_reg(void) NATIVE_SET_DSREG_OPEN(usd.lo, USD_lo_value) #define NATIVE_NV_WRITE_USD_HI_REG_VALUE(USD_hi_value) \ NATIVE_SET_DSREG_OPEN(usd.hi, USD_hi_value) -#define NATIVE_NV_WRITE_USD_REG_VALUE(USD_hi_value, USD_lo_value) \ -({ \ - NATIVE_NV_WRITE_USD_HI_REG_VALUE(USD_hi_value); \ - NATIVE_NV_WRITE_USD_LO_REG_VALUE(USD_lo_value); \ -}) -#define NATIVE_NV_WRITE_USD_REG(USD_hi, USD_lo) \ -({ \ - NATIVE_NV_WRITE_USD_REG_VALUE(USD_hi.USD_hi_half, USD_lo.USD_lo_half); \ -}) #define NATIVE_NV_WRITE_USBR_USD_REG_VALUE(usbr, usd_hi, usd_lo) \ do { \ NATIVE_NV_WRITE_USBR_REG_VALUE(usbr); \ + if (cpu_has(CPU_HWBUG_USD_ALIGNMENT)) \ + NATIVE_NV_WRITE_USD_LO_REG_VALUE(usd_lo); \ NATIVE_NV_WRITE_USD_HI_REG_VALUE(usd_hi); \ NATIVE_NV_WRITE_USD_LO_REG_VALUE(usd_lo); \ } while (0) @@ -266,6 +259,8 @@ do { \ #define NATIVE_NV_WRITE_USBR_USD_REG(usbr, usd_hi, usd_lo) \ do { \ NATIVE_NV_WRITE_USBR_REG(usbr); \ + if (cpu_has(CPU_HWBUG_USD_ALIGNMENT)) \ + NATIVE_NV_WRITE_USD_LO_REG(usd_lo); \ NATIVE_NV_WRITE_USD_HI_REG(usd_hi); \ NATIVE_NV_WRITE_USD_LO_REG(usd_lo); \ } while (0) @@ -429,8 +424,10 @@ extern void native_write_SCLKM2_reg_value(unsigned long reg_value); NATIVE_SET_SREG_CLOSED_NOEXC(dibcr, DIBCR_value, 4) #define NATIVE_WRITE_DIBSR_REG_VALUE(DIBSR_value) \ NATIVE_SET_SREG_CLOSED_NOEXC(dibsr, DIBSR_value, 4) +/* 6 cycles delay guarantees that all counting + * is stopped and %dibsr is updated accordingly.
*/ #define NATIVE_WRITE_DIMCR_REG_VALUE(DIMCR_value) \ - NATIVE_SET_DSREG_CLOSED_NOEXC(dimcr, DIMCR_value, 4) + NATIVE_SET_DSREG_CLOSED_NOEXC(dimcr, DIMCR_value, 5) #define NATIVE_WRITE_DIBAR0_REG_VALUE(DIBAR0_value) \ NATIVE_SET_DSREG_CLOSED_NOEXC(dibar0, DIBAR0_value, 4) #define NATIVE_WRITE_DIBAR1_REG_VALUE(DIBAR1_value) \ NATIVE_SET_DSREG_CLOSED_NOEXC(dibar1, DIBAR1_value, 4) diff --git a/arch/e2k/include/asm/native_mmu_regs_access.h b/arch/e2k/include/asm/native_mmu_regs_access.h index fcc700c..c3dbdbd 100644 --- a/arch/e2k/include/asm/native_mmu_regs_access.h +++ b/arch/e2k/include/asm/native_mmu_regs_access.h @@ -242,46 +242,44 @@ native_flush_ICACHE_all(void) /* * native MMU DEBUG registers access */ -#define NATIVE_READ_MMU_DEBUG_REG(reg_mnemonic) \ - NATIVE_GET_MMUREG(reg_mnemonic) -#define NATIVE_WRITE_MMU_DEBUG_REG(reg_mnemonic, reg_value) \ - NATIVE_SET_MMUREG(reg_mnemonic, reg_value) #define NATIVE_READ_DDBAR0_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddbar0) + NATIVE_GET_MMUREG(ddbar0) #define NATIVE_READ_DDBAR1_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddbar1) + NATIVE_GET_MMUREG(ddbar1) #define NATIVE_READ_DDBAR2_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddbar2) + NATIVE_GET_MMUREG(ddbar2) #define NATIVE_READ_DDBAR3_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddbar3) + NATIVE_GET_MMUREG(ddbar3) #define NATIVE_READ_DDBCR_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddbcr) + NATIVE_GET_MMUREG(ddbcr) #define NATIVE_READ_DDBSR_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddbsr) + NATIVE_GET_MMUREG(ddbsr) #define NATIVE_READ_DDMAR0_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddmar0) + NATIVE_GET_MMUREG(ddmar0) #define NATIVE_READ_DDMAR1_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddmar1) + NATIVE_GET_MMUREG(ddmar1) #define NATIVE_READ_DDMCR_REG_VALUE() \ - NATIVE_READ_MMU_DEBUG_REG(ddmcr) + NATIVE_GET_MMUREG(ddmcr) #define NATIVE_WRITE_DDBAR0_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddbar0, value) + NATIVE_SET_MMUREG(ddbar0, value) #define NATIVE_WRITE_DDBAR1_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddbar1, value) + NATIVE_SET_MMUREG(ddbar1, value) #define NATIVE_WRITE_DDBAR2_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddbar2, value) + NATIVE_SET_MMUREG(ddbar2, value) #define NATIVE_WRITE_DDBAR3_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddbar3, value) + NATIVE_SET_MMUREG(ddbar3, value) #define NATIVE_WRITE_DDBCR_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddbcr, value) + NATIVE_SET_MMUREG(ddbcr, value) #define NATIVE_WRITE_DDBSR_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddbsr, value) + NATIVE_SET_MMUREG(ddbsr, value) #define NATIVE_WRITE_DDMAR0_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddmar0, value) + NATIVE_SET_MMUREG(ddmar0, value) #define NATIVE_WRITE_DDMAR1_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddmar1, value) + NATIVE_SET_MMUREG(ddmar1, value) +/* 4 cycles delay guarantees that all counting + * is stopped and %ddbsr is updated accordingly.
*/ #define NATIVE_WRITE_DDMCR_REG_VALUE(value) \ - NATIVE_WRITE_MMU_DEBUG_REG(ddmcr, value) + NATIVE_SET_MMUREG_CLOSED(ddmcr, value, 3) #define NATIVE_READ_DDBAR0_REG() \ NATIVE_READ_DDBAR0_REG_VALUE() diff --git a/arch/e2k/include/asm/perf_event.h b/arch/e2k/include/asm/perf_event.h index 347ae52..a450604 100644 --- a/arch/e2k/include/asm/perf_event.h +++ b/arch/e2k/include/asm/perf_event.h @@ -1,17 +1,16 @@ -#ifndef _ASM_E2K_PERF_EVENT_H -#define _ASM_E2K_PERF_EVENT_H +#pragma once #include +#include +#include #include #include static inline void set_perf_event_pending(void) {} static inline void clear_perf_event_pending(void) {} -#define PERF_EVENT_INDEX_OFFSET 0 - -int perf_data_overflow_handle(struct pt_regs *); -int perf_instr_overflow_handle(struct pt_regs *); +void perf_data_overflow_handle(struct pt_regs *); +void perf_instr_overflow_handle(struct pt_regs *); void dimtp_overflow(struct perf_event *event); #define perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs @@ -22,35 +21,81 @@ static __always_inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, WARN_ON_ONCE(instruction_pointer(regs) != ip); } -#define ARCH_PERFMON_EVENT_MASK 0xffff -#define ARCH_PERFMON_OS (1 << 16) -#define ARCH_PERFMON_USR (1 << 17) -#define ARCH_PERFMON_ENABLED (1 << 18) +static inline e2k_dimcr_t dimcr_pause(void) +{ + e2k_dimcr_t dimcr, dimcr_old; + /* + * Stop counting for more precise group counting and also + * to avoid races when one counter overflows while another + * is being handled. + * + * Writing %dimcr also clears other pending exc_instr_debug + */ + dimcr = READ_DIMCR_REG(); + dimcr_old = dimcr; + AS(dimcr)[0].user = 0; + AS(dimcr)[0].system = 0; + AS(dimcr)[1].user = 0; + AS(dimcr)[1].system = 0; + WRITE_DIMCR_REG(dimcr); -DECLARE_PER_CPU(struct perf_event * [4], cpu_events); + return dimcr_old; +} -/* - * Bitmask for perf_monitors_used - * - * DIM0 has all counters from DIM1 and some more. So events for - * DIM1 are marked with DIM0_DIM1, and the actual used monitor - * will be determined at runtime. - */ -enum { - _DDM0 = 0, - _DDM1, - _DIM0, - _DIM1, - _DDM0_DDM1, - _DIM0_DIM1, - MAX_HW_MONITORS -}; -#define DDM0 (1 << _DDM0) -#define DDM1 (1 << _DDM1) -#define DIM0 (1 << _DIM0) -#define DIM1 (1 << _DIM1) -#define DDM0_DDM1 (1 << _DDM0_DDM1) -#define DIM0_DIM1 (1 << _DIM0_DIM1) +static inline e2k_ddmcr_t ddmcr_pause(void) +{ + e2k_ddmcr_t ddmcr, ddmcr_old; + /* + * Stop counting for more precise group counting and also + * to avoid races when one counter overflows while another + * is being handled. 
+ * + * Writing %ddmcr also clears other pending exc_data_debug + */ + ddmcr = READ_DDMCR_REG(); + ddmcr_old = ddmcr; + AS(ddmcr)[0].user = 0; + AS(ddmcr)[0].system = 0; + AS(ddmcr)[1].user = 0; + AS(ddmcr)[1].system = 0; + WRITE_DDMCR_REG(ddmcr); + + return ddmcr_old; +} + +#ifdef CONFIG_PERF_EVENTS +extern void dimcr_continue(e2k_dimcr_t dimcr_old); +extern void ddmcr_continue(e2k_ddmcr_t ddmcr_old); +#else +static inline void dimcr_continue(e2k_dimcr_t dimcr_old) +{ + e2k_dimcr_t dimcr; + + /* + * Restart counting + */ + dimcr = READ_DIMCR_REG(); + AS(dimcr)[0].user = AS(dimcr_old)[0].user; + AS(dimcr)[0].system = AS(dimcr_old)[0].system; + AS(dimcr)[1].user = AS(dimcr_old)[1].user; + AS(dimcr)[1].system = AS(dimcr_old)[1].system; + WRITE_DIMCR_REG(dimcr); +} + +static inline void ddmcr_continue(e2k_ddmcr_t ddmcr_old) +{ + e2k_ddmcr_t ddmcr; + + /* + * Restart counting + */ + ddmcr = READ_DDMCR_REG(); + AS(ddmcr)[0].user = AS(ddmcr_old)[0].user; + AS(ddmcr)[0].system = AS(ddmcr_old)[0].system; + AS(ddmcr)[1].user = AS(ddmcr_old)[1].user; + AS(ddmcr)[1].system = AS(ddmcr_old)[1].system; + WRITE_DDMCR_REG(ddmcr); +} #endif diff --git a/arch/e2k/include/asm/perf_event_types.h b/arch/e2k/include/asm/perf_event_types.h new file mode 100644 index 0000000..7ffd1c2 --- /dev/null +++ b/arch/e2k/include/asm/perf_event_types.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +#define PERF_EVENT_INDEX_OFFSET 0 + +#define ARCH_PERFMON_EVENT_MASK 0xffff +#define ARCH_PERFMON_OS (1 << 16) +#define ARCH_PERFMON_USR (1 << 17) +#define ARCH_PERFMON_ENABLED (1 << 18) + +DECLARE_PER_CPU(struct perf_event * [4], cpu_events); + +#ifdef CONFIG_PERF_EVENTS +DECLARE_PER_CPU(u8, perf_monitors_used); +DECLARE_PER_CPU(u8, perf_bps_used); +# define perf_read_monitors_used() __this_cpu_read(perf_monitors_used) +# define perf_read_bps_used() __this_cpu_read(perf_bps_used) +#else /* ! CONFIG_PERF_EVENTS */ +# define perf_read_monitors_used() 0 +# define perf_read_bps_used() 0 +#endif /* CONFIG_PERF_EVENTS */ + +/* + * Bitmask for perf_monitors_used + * + * DIM0 has all counters from DIM1 and some more. So events for + * DIM1 are marked with DIM0_DIM1, and the actual used monitor + * will be determined at runtime. + */ +enum { + _DDM0 = 0, + _DDM1, + _DIM0, + _DIM1, + _DDM0_DDM1, + _DIM0_DIM1, + MAX_HW_MONITORS +}; +#define DDM0 (1 << _DDM0) +#define DDM1 (1 << _DDM1) +#define DIM0 (1 << _DIM0) +#define DIM1 (1 << _DIM1) +#define DDM0_DDM1 (1 << _DDM0_DDM1) +#define DIM0_DIM1 (1 << _DIM0_DIM1) diff --git a/arch/e2k/include/asm/perf_event_uncore.h b/arch/e2k/include/asm/perf_event_uncore.h index 316a1a6..a660cf0 100644 --- a/arch/e2k/include/asm/perf_event_uncore.h +++ b/arch/e2k/include/asm/perf_event_uncore.h @@ -82,7 +82,7 @@ struct e2k_uncore { struct e2k_uncore_valid_events *valid_events; int (*validate_event)(struct e2k_uncore *, struct hw_perf_event *); u64 (*get_event)(struct hw_perf_event *); - int (*add_event)(struct perf_event *); + int (*add_event)(struct e2k_uncore *, struct perf_event *); struct e2k_uncore_reg_ops *reg_ops; struct pmu pmu; diff --git a/arch/e2k/include/asm/process.h b/arch/e2k/include/asm/process.h index 5a62028..a238ca1 100644 --- a/arch/e2k/include/asm/process.h +++ b/arch/e2k/include/asm/process.h @@ -27,24 +27,6 @@ #include #include /* host mode support */ -#undef DEBUG_PV_UST_MODE -#undef DebugUST -#define DEBUG_PV_UST_MODE 0 /* guest user stacks debug */ -#define DebugUST(fmt, args...) 
\ -({ \ - if (debug_guest_ust) \ - pr_info("%s(): " fmt, __func__, ##args); \ -}) - -#undef DEBUG_PV_SYSCALL_MODE -#define DEBUG_PV_SYSCALL_MODE 0 /* syscall injection debugging */ - -#if DEBUG_PV_UST_MODE || DEBUG_PV_SYSCALL_MODE -extern bool debug_guest_ust; -#else -#define debug_guest_ust false -#endif /* DEBUG_PV_UST_MODE || DEBUG_PV_SYSCALL_MODE */ - #undef DEBUG_SS_MODE #undef DebugSS #define DEBUG_SS_MODE 0 /* stack switching */ @@ -818,12 +800,10 @@ static inline void free_virt_task_struct(struct task_struct *task) { /* virtual machines is not supported */ } -#else /* CONFIG_VIRTUALIZATION */ +#elif defined(CONFIG_KVM_HOST_MODE) /* It is native host kernel with virtualization support */ /* or paravirtualized host and guest */ -/* or native guest kernel #include - */ #endif /* ! CONFIG_VIRTUALIZATION */ /* @@ -852,119 +832,6 @@ native_preserve_user_hw_stacks_to_copy(e2k_stacks_t *u_stacks, u_stacks->pcshtp = cur_stacks->pcshtp; } -static __always_inline void -native_kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) -{ - void *dst_tail; - const void *src_tail; - u64 copied; - int i; - - /* - * Kernel does not use FP registers so do not copy them. - * This only applies to CPUs before V5 instruction set - * (since V5 FP registers become general-purpose QP registers). - */ - if (cpu_has(CPU_FEAT_QPREG)) { -#pragma loop count (10) - for (i = 0; i < size / 64; i++) - E2K_TAGGED_MEMMOVE_64(&dst[8 * i], &src[8 * i]); - - copied = round_down(size, 64); - dst_tail = (void *) dst + copied; - src_tail = (void *) src + copied; - } else { -#pragma loop count (5) - for (i = 0; i < size / 128; i++) - E2K_TAGGED_MEMMOVE_128_RF_V2(&dst[16 * i], - &src[16 * i]); - - copied = round_down(size, 128); - dst_tail = (void *) dst + copied; - src_tail = (void *) src + copied; - - if (size & 64) { - E2K_TAGGED_MEMMOVE_64(dst_tail, src_tail); - dst_tail += 64; - src_tail += 64; - } - } - - if (size & 32) - E2K_TAGGED_MEMMOVE_32(dst_tail, src_tail); -} - -static __always_inline void -native_collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) -{ - e2k_pcsp_hi_t k_pcsp_hi; - u64 size; - int i; - - DebugUST("current host chain stack index 0x%x, PCSHTP 0x%llx\n", - NATIVE_NV_READ_PCSP_HI_REG().PCSP_hi_ind, - NATIVE_READ_PCSHTP_REG_SVALUE()); - - NATIVE_FLUSHC; - k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - - size = k_pcsp_hi.PCSP_hi_ind - spilled_size; - BUG_ON(!IS_ALIGNED(size, ALIGN_PCSTACK_TOP_SIZE) || (s64) size < 0); -#pragma loop count (2) - for (i = 0; i < size / 32; i++) { - u64 v0, v1, v2, v3; - - v0 = src[4 * i]; - v1 = src[4 * i + 1]; - v2 = src[4 * i + 2]; - v3 = src[4 * i + 3]; - dst[4 * i] = v0; - dst[4 * i + 1] = v1; - dst[4 * i + 2] = v2; - dst[4 * i + 3] = v3; - } - - k_pcsp_hi.PCSP_hi_ind -= spilled_size; - NATIVE_NV_NOIRQ_WRITE_PCSP_HI_REG(k_pcsp_hi); - - DebugUST("move spilled chain part from host top %px to " - "bottom %px, size 0x%llx\n", - src, dst, size); - DebugUST("host kernel chain stack index is now 0x%x, " - "guest user PCSHTP 0x%llx\n", - k_pcsp_hi.PCSP_hi_ind, spilled_size); -} - -static __always_inline void -native_collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) -{ - e2k_psp_hi_t k_psp_hi; - u64 size; - - DebugUST("current host procedure stack index 0x%x, PSHTP 0x%x\n", - NATIVE_NV_READ_PSP_HI_REG().PSP_hi_ind, - NATIVE_NV_READ_PSHTP_REG().PSHTP_ind); - - NATIVE_FLUSHR; - k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); - - size = k_psp_hi.PSP_hi_ind - spilled_size; - BUG_ON(!IS_ALIGNED(size, ALIGN_PSTACK_TOP_SIZE) || (s64) size < 0); - - 
prefetchw_range(src, size); - native_kernel_hw_stack_frames_copy(dst, src, size); - - k_psp_hi.PSP_hi_ind -= spilled_size; - NATIVE_NV_NOIRQ_WRITE_PSP_HI_REG(k_psp_hi); - - DebugUST("move spilled procedure part from host top %px to " - "bottom %px, size 0x%llx\n", - src, dst, size); - DebugUST("host kernel procedure stack index is now 0x%x, " - "guest user PSHTP 0x%llx\n", - k_psp_hi.PSP_hi_ind, spilled_size); -} - /** * find_in_u_pcs_list - find frame offset from old_u_pcs_list * @frame - frame to search @@ -1093,22 +960,6 @@ preserve_user_hw_stacks_to_copy(e2k_stacks_t *u_stacks, native_preserve_user_hw_stacks_to_copy(u_stacks, cur_stacks); } -static __always_inline void -kernel_hw_stack_frames_copy(u64 *dst, const u64 *src, unsigned long size) -{ - native_kernel_hw_stack_frames_copy(dst, src, size); -} -static __always_inline void -collapse_kernel_pcs(u64 *dst, const u64 *src, u64 spilled_size) -{ - native_collapse_kernel_pcs(dst, src, spilled_size); -} -static __always_inline void -collapse_kernel_ps(u64 *dst, const u64 *src, u64 spilled_size) -{ - native_collapse_kernel_ps(dst, src, spilled_size); -} - #if !defined(CONFIG_VIRTUALIZATION) /* native kernel without virtualization support */ #define do_map_user_hard_stack_to_kernel(node, kstart, ubase, size) \ @@ -1164,104 +1015,6 @@ copy_spilled_user_stacks(struct e2k_stacks *child_stacks, #error "Undefined virtualization mode" #endif /* CONFIG_PARAVIRT_GUEST */ -/* - * Copy hardware stack from user to *current* kernel stack. - * One has to be careful to avoid hardware FILL of this stack. - */ -static inline int __copy_user_to_current_hw_stack(void *dst, void __user *src, - unsigned long size, const pt_regs_t *regs, bool chain) -{ - unsigned long min_flt, maj_flt, ts_flag; - - if (likely(!host_test_intc_emul_mode(regs))) { - if (!__range_ok((unsigned long __force) src, size, - PAGE_OFFSET)) - return -EFAULT; - } - - ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); - - /* - * Every page fault here has a chance of FILL'ing the frame - * that is being copied, in which case we repeat the copy. - */ - do { - min_flt = READ_ONCE(current->min_flt); - maj_flt = READ_ONCE(current->maj_flt); - - if (chain) - E2K_FLUSHC; - else - E2K_FLUSHR; - - SET_USR_PFAULT("$.recovery_memcpy_fault"); - fast_tagged_memory_copy_from_user(dst, src, size, regs, - TAGGED_MEM_STORE_REC_OPC | - MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, - TAGGED_MEM_LOAD_REC_OPC | - MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, - true); - if (RESTORE_USR_PFAULT) { - clear_ts_flag(ts_flag); - return -EFAULT; - } - } while (unlikely(min_flt != READ_ONCE(current->min_flt) || - maj_flt != READ_ONCE(current->maj_flt))); - - clear_ts_flag(ts_flag); - return 0; -} - - -static inline int copy_user_to_current_hw_stack(void *dst, void __user *src, - unsigned long size, pt_regs_t *regs, bool chain) -{ - unsigned long flags; - int ret; - - raw_all_irq_save(flags); - ret = __copy_user_to_current_hw_stack(dst, src, size, regs, chain); - raw_all_irq_restore(flags); - - return ret; -} - -static inline int copy_e2k_stack_from_user(void *dst, void __user *src, - unsigned long size, pt_regs_t *regs) -{ - unsigned long ts_flag; - int ret; - - if (likely(!host_test_intc_emul_mode(regs))) { - if (!__range_ok((unsigned long __force) src, size, PAGE_OFFSET)) - return -EFAULT; - } - - ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); - ret = host_copy_from_user_with_tags(dst, src, size, regs); - clear_ts_flag(ts_flag); - - return (ret) ? 
-EFAULT : 0; -} - -static inline int copy_e2k_stack_to_user(void __user *dst, void *src, - unsigned long size, pt_regs_t *regs) -{ - unsigned long ts_flag; - int ret; - - if (likely(!host_test_intc_emul_mode(regs))) { - if (!__range_ok((unsigned long __force) dst, size, PAGE_OFFSET)) - return -EFAULT; - } - - ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); - ret = host_copy_to_user_with_tags(dst, src, size, regs); - clear_ts_flag(ts_flag); - - return (ret) ? -EFAULT : 0; -} - DECLARE_PER_CPU(void *, reserve_hw_stacks); static inline int on_reserve_stacks(void) { @@ -1278,134 +1031,6 @@ static inline int on_reserve_stacks(void) KERNEL_PC_STACK_SIZE; } -static __always_inline int -user_hw_stack_frames_copy(void __user *dst, void *src, unsigned long copy_size, - const pt_regs_t *regs, unsigned long hw_stack_ind, bool is_pcsp) -{ - unsigned long ts_flag; - - if (unlikely(hw_stack_ind < copy_size)) { - unsigned long flags; - raw_all_irq_save(flags); - if (is_pcsp) { - E2K_FLUSHC; - } else { - E2K_FLUSHR; - } - raw_all_irq_restore(flags); - } - - SET_USR_PFAULT("$.recovery_memcpy_fault"); - - ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); - fast_tagged_memory_copy_to_user(dst, src, copy_size, regs, - TAGGED_MEM_STORE_REC_OPC | - MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, - TAGGED_MEM_LOAD_REC_OPC | - MAS_BYPASS_L1_CACHE << LDST_REC_OPC_MAS_SHIFT, true); - clear_ts_flag(ts_flag); - - if (RESTORE_USR_PFAULT) { - pr_err("process %s (%d) %s stack could not be copied " - "from %px to %px size 0x%lx (out of memory?)\n", - current->comm, current->pid, - (is_pcsp) ? "chain" : "procedure", - src, dst, copy_size); - return -EFAULT; - } - DebugUST("copying guest %s stack spilled to host from %px " - "to guest kernel stack from %px, size 0x%lx\n", - (is_pcsp) ? "chain" : "procedure", src, dst, copy_size); - - return 0; -} - -static __always_inline int -user_crs_frames_copy(e2k_mem_crs_t __user *u_frame, pt_regs_t *regs, - e2k_mem_crs_t *crs) -{ - unsigned long ts_flag; - int ret; - - ts_flag = set_ts_flag(TS_KERNEL_SYSCALL); - ret = host_copy_to_user(u_frame, crs, sizeof(*crs), regs); - clear_ts_flag(ts_flag); - if (unlikely(ret)) - return -EFAULT; - - return 0; -} - -static __always_inline u64 get_wsz(enum restore_caller from) -{ - return NATIVE_READ_WD_REG().size >> 4; -} - -static __always_inline int user_psp_stack_copy(e2k_psp_lo_t u_psp_lo, - e2k_psp_hi_t u_psp_hi, s64 u_pshtp_size, - e2k_psp_lo_t k_psp_lo, e2k_psp_hi_t k_psp_hi, - unsigned long copy_size, const pt_regs_t *regs) -{ - void __user *dst; - void *src; - - dst = (void __user *) (AS(u_psp_lo).base + AS(u_psp_hi).ind - - u_pshtp_size); - src = (void *) AS(k_psp_lo).base; - - return user_hw_stack_frames_copy(dst, src, copy_size, - regs, k_psp_hi.PSP_hi_ind, false); -} - -static __always_inline int user_pcsp_stack_copy(e2k_pcsp_lo_t u_pcsp_lo, - e2k_pcsp_hi_t u_pcsp_hi, s64 u_pcshtp_size, - e2k_pcsp_lo_t k_pcsp_lo, e2k_pcsp_hi_t k_pcsp_hi, - unsigned long copy_size, const pt_regs_t *regs) -{ - void __user *dst; - void *src; - - dst = (void __user *)(AS(u_pcsp_lo).base + AS(u_pcsp_hi).ind - - u_pcshtp_size); - src = (void *) AS(k_pcsp_lo).base; - - return user_hw_stack_frames_copy(dst, src, copy_size, - regs, k_pcsp_hi.PCSP_hi_ind, true); -} - -static __always_inline u64 get_ps_clear_size(u64 cur_window_q, - e2k_pshtp_t pshtp) -{ - s64 u_pshtp_size_q; - - u_pshtp_size_q = GET_PSHTP_Q_INDEX(pshtp); - if (u_pshtp_size_q > E2K_MAXSR - cur_window_q) - u_pshtp_size_q = E2K_MAXSR - cur_window_q; - - return E2K_MAXSR - (cur_window_q + u_pshtp_size_q); -} - 
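/*
 * A worked illustration (not part of this patch) of what the two removed
 * helpers compute: get_ps_clear_size() above and get_ps_copy_size() just
 * below. The register file can FILL at most E2K_MAXSR quadro registers,
 * of which cur_window_q are occupied by the current window, and each
 * quadro register takes EXT_4_NR_SZ bytes of procedure stack memory;
 * whatever was SPILLed beyond that budget must be copied to user space
 * by hand. This helper is hypothetical, written only to make the
 * arithmetic explicit.
 */
static inline bool ps_needs_manual_copy(u64 cur_window_q, s64 u_pshtp_size)
{
	/* bytes the hardware can bring back with a FILL */
	s64 fill_budget = (E2K_MAXSR - cur_window_q) * EXT_4_NR_SZ;

	/* equivalent to get_ps_copy_size() > 0 */
	return u_pshtp_size > fill_budget;
}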
-static __always_inline s64 get_ps_copy_size(u64 cur_window_q, s64 u_pshtp_size) -{ - return u_pshtp_size - (E2K_MAXSR - cur_window_q) * EXT_4_NR_SZ; -} - -#ifdef CONFIG_CPU_HAS_FILL_INSTRUCTION -# define E2K_CF_MAX_FILL (E2K_CF_MAX_FILL_FILLC_q * 0x10) -#else -extern int cf_max_fill_return; -# define E2K_CF_MAX_FILL cf_max_fill_return -#endif - -static __always_inline s64 get_pcs_copy_size(s64 u_pcshtp_size) -{ - /* Before v6 it was possible to fill no more than 16 registers. - * Since E2K_MAXCR_q is much bigger than 16 we can be sure that - * there is enough space in CF for the FILL, so there is no - * need to take into account space taken by current window. */ - return u_pcshtp_size - E2K_CF_MAX_FILL; -} - #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* This function is used to fixup ret_stack, so make sure it itself * does not rely on correct values in ret_stack by using "notrace". */ @@ -1427,352 +1052,14 @@ static inline void apply_graph_tracer_delta(unsigned long delta) } #endif -/** - * user_hw_stacks_copy - copy user hardware stacks that have been - * SPILLed to kernel back to user space - * @stacks - saved user stack registers - * @cur_window_q - size of current window in procedure stack, - * needed only if @copy_full is not set - * @copy_full - set if want to copy _all_ of SPILLed stacks - * - * This does not update stacks->pshtp and stacks->pcshtp. Main reason is - * signals: if a signal arrives after copying then it must see a coherent - * state where saved stacks->pshtp and stacks->pcshtp values show how much - * data from user space is spilled to kernel space. - */ -static __always_inline int -native_user_hw_stacks_copy(struct e2k_stacks *stacks, - pt_regs_t *regs, u64 cur_window_q, bool copy_full) -{ - trap_pt_regs_t *trap = regs->trap; - e2k_psp_lo_t u_psp_lo = stacks->psp_lo, - k_psp_lo = current_thread_info()->k_psp_lo; - e2k_psp_hi_t u_psp_hi = stacks->psp_hi; - e2k_pcsp_lo_t u_pcsp_lo = stacks->pcsp_lo, - k_pcsp_lo = current_thread_info()->k_pcsp_lo; - e2k_pcsp_hi_t u_pcsp_hi = stacks->pcsp_hi; - s64 u_pshtp_size, u_pcshtp_size, ps_copy_size, pcs_copy_size; - int ret; - - u_pshtp_size = GET_PSHTP_MEM_INDEX(stacks->pshtp); - u_pcshtp_size = PCSHTP_SIGN_EXTEND(stacks->pcshtp); - - /* - * Copy user's part from kernel stacks into user stacks - * Update user's stack registers - */ - if (copy_full) { - pcs_copy_size = u_pcshtp_size; - ps_copy_size = u_pshtp_size; - } else { - pcs_copy_size = get_pcs_copy_size(u_pcshtp_size); - ps_copy_size = get_ps_copy_size(cur_window_q, u_pshtp_size); - - /* Make sure there is enough space in CF for the FILL */ - BUG_ON((E2K_MAXCR_q - 4) * 16 < E2K_CF_MAX_FILL); - } - - if (likely(pcs_copy_size <= 0 && ps_copy_size <= 0)) - return 0; - - if (unlikely(pcs_copy_size > 0)) { - e2k_pcsp_hi_t k_pcsp_hi = NATIVE_NV_READ_PCSP_HI_REG(); - - /* Since not all user data has been SPILL'ed it is possible - * that we have already overflown user's hardware stack. 
*/ - if (unlikely(AS(u_pcsp_hi).ind > AS(u_pcsp_hi).size)) { - ret = handle_chain_stack_bounds(stacks, trap); - if (unlikely(ret)) { - pr_warning("process %s (%d) chain stack overflow (out of memory?)\n", - current->comm, current->pid); - return ret; - } - - u_pcsp_lo = stacks->pcsp_lo; - u_pcsp_hi = stacks->pcsp_hi; - } - - ret = user_pcsp_stack_copy(u_pcsp_lo, u_pcsp_hi, u_pcshtp_size, - k_pcsp_lo, k_pcsp_hi, pcs_copy_size, regs); - if (ret) - return ret; - } - - if (unlikely(ps_copy_size > 0)) { - e2k_psp_hi_t k_psp_hi = NATIVE_NV_READ_PSP_HI_REG(); - - /* Since not all user data has been SPILL'ed it is possible - * that we have already overflowed user's hardware stack. */ - if (unlikely(AS(u_psp_hi).ind > AS(u_psp_hi).size)) { - ret = handle_proc_stack_bounds(stacks, trap); - if (unlikely(ret)) { - pr_warning("process %s (%d) procedure stack overflow (out of memory?)\n", - current->comm, current->pid); - return ret; - } - - u_psp_lo = stacks->psp_lo; - u_psp_hi = stacks->psp_hi; - } - - ret = user_psp_stack_copy(u_psp_lo, u_psp_hi, u_pshtp_size, - k_psp_lo, k_psp_hi, ps_copy_size, regs); - if (ret) - return ret; - } - - return 0; -} - -static inline void collapse_kernel_hw_stacks(struct e2k_stacks *stacks) -{ - e2k_pcsp_lo_t k_pcsp_lo = current_thread_info()->k_pcsp_lo; - e2k_psp_lo_t k_psp_lo = current_thread_info()->k_psp_lo; - unsigned long flags, spilled_pc_size, spilled_p_size; - e2k_pshtp_t pshtp = stacks->pshtp; - u64 *dst; - const u64 *src; - - spilled_pc_size = PCSHTP_SIGN_EXTEND(stacks->pcshtp); - spilled_p_size = GET_PSHTP_MEM_INDEX(pshtp); - DebugUST("guest user spilled to host kernel stack part: chain 0x%lx " - "procedure 0x%lx\n", - spilled_pc_size, spilled_p_size); - /* When user tries to return from the last user frame - * we will have pcshtp = pcsp_hi.ind = 0. But situation - * with pcsp_hi.ind != 0 and pcshtp = 0 is impossible. */ - if (WARN_ON_ONCE(spilled_pc_size < SZ_OF_CR && - AS(stacks->pcsp_hi).ind != 0)) - do_exit(SIGKILL); - - /* Keep the last user frame (see user_hw_stacks_copy_full()) */ - if (spilled_pc_size >= SZ_OF_CR) { - spilled_pc_size -= SZ_OF_CR; - DebugUST("Keep the prev user chain frame, so spilled chain " - "size is now 0x%lx\n", - spilled_pc_size); - } - - raw_all_irq_save(flags); - - if (spilled_pc_size) { - dst = (u64 *) AS(k_pcsp_lo).base; - src = (u64 *) (AS(k_pcsp_lo).base + spilled_pc_size); - collapse_kernel_pcs(dst, src, spilled_pc_size); - - stacks->pcshtp = SZ_OF_CR; - - apply_graph_tracer_delta(-spilled_pc_size); - } - - if (spilled_p_size) { - dst = (u64 *) AS(k_psp_lo).base; - src = (u64 *) (AS(k_psp_lo).base + spilled_p_size); - collapse_kernel_ps(dst, src, spilled_p_size); - - AS(pshtp).ind = 0; - stacks->pshtp = pshtp; - } - - raw_all_irq_restore(flags); -} - -/** - * user_hw_stacks_prepare - prepare user hardware stacks that have been - * SPILLed to kernel back to user space - * @stacks - saved user stack registers - * @cur_window_q - size of current window in procedure stack, - * needed only if @copy_full is not set - * @syscall - true if called upon direct system call exit (no signal handlers) - * - * This does two things: - * - * 1) It is possible that upon kernel entry pcshtp == 0 in some cases: - * - user signal handler had pcshtp==0x20 before return to sigreturn() - * - user context had pcshtp==0x20 before return to makecontext_trampoline() - * - chain stack underflow happened - * So it is possible in sigreturn() and traps, but not in system calls. 
- * If we are using the trick with return to FILL user hardware stacks than - * we must have frame in chain stack to return to. So in this case kernel's - * chain stack is moved up by one frame (0x20 bytes). - * We also fill the new frame with actual user data and update stacks->pcshtp, - * this is needed to keep the coherent state where saved stacks->pcshtp values - * shows how much data from user space has been spilled to kernel space. - * - * 2) It is not possible to always FILL all of user data that have been - * SPILLed to kernel stacks. So we manually copy the leftovers that can - * not be FILLed to user space. - * This copy does not update stacks->pshtp and stacks->pcshtp. Main reason - * is signals: if a signal arrives after copying then it must see a coherent - * state where saved stacks->pshtp and stacks->pcshtp values show how much - * data from user space has been spilled to kernel space. - */ -static __always_inline void native_user_hw_stacks_prepare( - struct e2k_stacks *stacks, pt_regs_t *regs, - u64 cur_window_q, enum restore_caller from, int syscall) -{ - e2k_pcshtp_t u_pcshtp = stacks->pcshtp; - int ret; - - BUG_ON(from & FROM_PV_VCPU_MODE); - - /* - * 1) Make sure there is free space in kernel chain stack to return to - */ - if (!syscall && u_pcshtp == 0) { - unsigned long flags; - e2k_pcsp_lo_t u_pcsp_lo = stacks->pcsp_lo, - k_pcsp_lo = current_thread_info()->k_pcsp_lo; - e2k_pcsp_hi_t u_pcsp_hi = stacks->pcsp_hi, k_pcsp_hi; - e2k_mem_crs_t __user *u_cframe; - e2k_mem_crs_t *k_crs; - u64 u_cbase; - int ret = -EINVAL; - - raw_all_irq_save(flags); - E2K_FLUSHC; - k_pcsp_hi = READ_PCSP_HI_REG(); - BUG_ON(AS(k_pcsp_hi).ind); - AS(k_pcsp_hi).ind += SZ_OF_CR; - WRITE_PCSP_HI_REG(k_pcsp_hi); - - k_crs = (e2k_mem_crs_t *) AS(k_pcsp_lo).base; - u_cframe = (e2k_mem_crs_t __user *) (AS(u_pcsp_lo).base + - AS(u_pcsp_hi).ind); - u_cbase = ((from & FROM_RETURN_PV_VCPU_TRAP) || - host_test_intc_emul_mode(regs)) ? - u_pcsp_lo.PCSP_lo_base : - (u64) CURRENT_PCS_BASE(); - if ((u64) u_cframe > u_cbase) { - ret = __copy_user_to_current_hw_stack(k_crs, - u_cframe - 1, sizeof(*k_crs), regs, true); - } - raw_all_irq_restore(flags); - - /* Can happen if application returns until runs out of - * chain stack or there is no free memory for stacks. - * There is no user stack to return to - die. */ - if (ret) { - E2K_LMS_HALT_OK; - SIGDEBUG_PRINT("SIGKILL. %s\n", - (ret == -EINVAL) ? 
"tried to return to kernel" : - "ran into Out-of-Memory on user stacks"); - force_sig(SIGKILL); - return; - } - - if (AS(u_pcsp_hi).ind < SZ_OF_CR) { - update_pcsp_regs(AS(u_pcsp_lo).base, - &u_pcsp_lo, &u_pcsp_hi); - stacks->pcsp_lo = u_pcsp_lo; - stacks->pcsp_hi = u_pcsp_hi; - BUG_ON(AS(u_pcsp_hi).ind < SZ_OF_CR); - } - - u_pcshtp = SZ_OF_CR; - stacks->pcshtp = u_pcshtp; - } - - /* - * 2) Copy user data that cannot be FILLed - */ - ret = native_user_hw_stacks_copy(stacks, regs, cur_window_q, false); - if (unlikely(ret)) - do_exit(SIGKILL); -} - -#ifndef CONFIG_VIRTUALIZATION -/* native kernel without virtualization support */ -static __always_inline int -user_hw_stacks_copy(struct e2k_stacks *stacks, - pt_regs_t *regs, u64 cur_window_q, bool copy_full) -{ - return native_user_hw_stacks_copy(stacks, regs, cur_window_q, copy_full); -} - -static __always_inline void -host_user_hw_stacks_prepare(struct e2k_stacks *stacks, pt_regs_t *regs, - u64 cur_window_q, enum restore_caller from, int syscall) -{ - native_user_hw_stacks_prepare(stacks, regs, cur_window_q, - from, syscall); -} -#elif defined(CONFIG_KVM_GUEST_KERNEL) -/* It is native guest kernel (without paravirtualization) */ -#include -#elif defined(CONFIG_PARAVIRT_GUEST) -/* It is paravirtualized kernel (host and guest) */ -#include -#elif defined(CONFIG_KVM_HOST_MODE) -/* It is host kernel with virtualization support */ -#include -#else /* unknow mode */ -#error "unknown virtualization mode" -#endif /* !CONFIG_VIRTUALIZATION */ - -/** - * user_hw_stacks_copy_full - copy part of user stacks that was SPILLed - * into kernel back to user stacks. - * @stacks - saved user stack registers - * @regs - pt_regs pointer - * @crs - last frame to copy - * - * If @crs is not NULL then the frame pointed to by it will also be copied - * to userspace. Note that 'stacks->pcsp_hi.ind' is _not_ updated after - * copying since it would leave stack in inconsistent state (with two - * copies of the same @crs frame), this is left to the caller. * - * - * Inlining this reduces the amount of memory to copy in - * collapse_kernel_hw_stacks(). - */ -static inline int user_hw_stacks_copy_full(struct e2k_stacks *stacks, - pt_regs_t *regs, e2k_mem_crs_t *crs) -{ - int ret; - - /* - * Copy part of user stacks that were SPILLed into kernel stacks - */ - ret = user_hw_stacks_copy(stacks, regs, 0, true); - if (unlikely(ret)) - return ret; - - /* - * Nothing to FILL so remove the resulting hole from kernel stacks. - * - * IMPORTANT: there is always at least one user frame at the top of - * kernel stack - the one that issued a system call (in case of an - * exception we uphold this rule manually, see user_hw_stacks_prepare()) - * We keep this ABI and _always_ leave space for one user frame, - * this way we can later FILL using return trick (otherwise there - * would be no space in chain stack for the trick). - */ - collapse_kernel_hw_stacks(stacks); - - /* - * Copy saved %cr registers - * - * Caller must take care of filling of resulting hole - * (last user frame from pcshtp == SZ_OF_CR). 
- */ - if (crs) { - e2k_mem_crs_t __user *u_frame; - int ret; - - u_frame = (void __user *) (AS(stacks->pcsp_lo).base + - AS(stacks->pcsp_hi).ind); - ret = user_crs_frames_copy(u_frame, regs, ®s->crs); - if (unlikely(ret)) - return ret; - } - - return 0; -} +extern int user_hw_stacks_copy_full(struct e2k_stacks *stacks, + pt_regs_t *regs, e2k_mem_crs_t *crs); extern e2k_addr_t get_nested_kernel_IP(pt_regs_t *regs, int n); extern unsigned long remap_e2k_stack(unsigned long addr, unsigned long old_size, unsigned long new_size, bool after); extern int find_cui_by_ip(unsigned long ip); + #endif /* _E2K_PROCESS_H */ diff --git a/arch/e2k/include/asm/regs_state.h b/arch/e2k/include/asm/regs_state.h index 70e8bb8..ac06f0a 100644 --- a/arch/e2k/include/asm/regs_state.h +++ b/arch/e2k/include/asm/regs_state.h @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -219,14 +220,6 @@ do { \ NATIVE_DO_SAVE_MONITOR_COUNTERS(sw_regs); \ } while (0) -/* - * When we use monitor registers, we count monitor events for the whole system, - * so DIMAR0, DIMAR1, DDMAR0 and DDMAR1 registers are not depend on process and - * need not be saved while process switching. DIMCR and DDMCR registers are not - * depend on process too, but they should be saved while process switching, - * because they are used to determine monitoring start moment during monitor - * events counting for a process. - */ static inline void native_save_user_only_regs(struct sw_regs *sw_regs) { if (machine.save_dimtp) @@ -946,27 +939,46 @@ static inline void native_restore_user_only_regs(struct sw_regs *sw_regs) } } -#ifdef CONFIG_PERF_EVENTS -DECLARE_PER_CPU(u8, perf_monitors_used); -DECLARE_PER_CPU(u8, perf_bps_used); -# define is_perf_using_monitors __this_cpu_read(perf_monitors_used) -# define is_perf_using_bps __this_cpu_read(perf_bps_used) -#else /* ! 
CONFIG_PERF_EVENTS */ -#define is_perf_using_monitors false -#define is_perf_using_bps false -#endif /* CONFIG_PERF_EVENTS */ - static inline void native_clear_user_only_regs(void) { - if (!is_perf_using_bps) { + u8 monitors_used = perf_read_monitors_used(); + u8 bps_used = perf_read_bps_used(); + if (!bps_used) { NATIVE_WRITE_DIBCR_REG_VALUE(0); NATIVE_WRITE_DDBCR_REG_VALUE(0); } - if (!MONITORING_IS_ACTIVE && !is_perf_using_monitors) { - NATIVE_WRITE_DIMCR_REG_VALUE(0); - NATIVE_WRITE_DIBSR_REG_VALUE(0); - NATIVE_WRITE_DDMCR_REG_VALUE(0); - NATIVE_WRITE_DDBSR_REG_VALUE(0); + if (!MONITORING_IS_ACTIVE) { + if (!monitors_used) { + NATIVE_WRITE_DIMCR_REG_VALUE(0); + NATIVE_WRITE_DIBSR_REG_VALUE(0); + NATIVE_WRITE_DDMCR_REG_VALUE(0); + NATIVE_WRITE_DDBSR_REG_VALUE(0); + } else { + e2k_dimcr_t dimcr = NATIVE_READ_DIMCR_REG(); + e2k_ddmcr_t ddmcr = NATIVE_READ_DDMCR_REG(); + e2k_dibsr_t dibsr = NATIVE_READ_DIBSR_REG(); + e2k_ddbsr_t ddbsr = NATIVE_READ_DDBSR_REG(); + if (!(monitors_used & DIM0)) { + dimcr.half_word[0] = 0; + dibsr.m0 = 0; + } + if (!(monitors_used & DIM1)) { + dimcr.half_word[1] = 0; + dibsr.m1 = 0; + } + if (!(monitors_used & DDM0)) { + ddmcr.half_word[0] = 0; + ddbsr.m0 = 0; + } + if (!(monitors_used & DDM1)) { + ddmcr.half_word[1] = 0; + ddbsr.m1 = 0; + } + NATIVE_WRITE_DIMCR_REG(dimcr); + NATIVE_WRITE_DDMCR_REG(ddmcr); + NATIVE_WRITE_DIBSR_REG(dibsr); + NATIVE_WRITE_DDBSR_REG(ddbsr); + } } } @@ -1299,7 +1311,6 @@ native_set_current_thread_info(struct thread_info *thread, set_osgd_task_struct(task); } - static inline void set_current_thread_info(struct thread_info *thread, struct task_struct *task) { @@ -1308,5 +1319,21 @@ set_current_thread_info(struct thread_info *thread, struct task_struct *task) set_osgd_task_struct(task); } +#define SAVE_PSYSCALL_RVAL(regs, _rval, _rval1, _rval2, _rv1_tag, \ + _rv2_tag, _return_desk) \ +({ \ + (regs)->sys_rval = (_rval); \ + (regs)->rval1 = (_rval1); \ + (regs)->rval2 = (_rval2); \ + (regs)->rv1_tag = (_rv1_tag); \ + (regs)->rv2_tag = (_rv2_tag); \ + (regs)->return_desk = (_return_desk); \ +}) + +#define SAVE_SYSCALL_RVAL(regs, rval) \ +({ \ + (regs)->sys_rval = (rval); \ +}) + #endif /* _E2K_REGS_STATE_H */ diff --git a/arch/e2k/include/asm/sic_regs.h b/arch/e2k/include/asm/sic_regs.h index a602d2a..89e0519 100644 --- a/arch/e2k/include/asm/sic_regs.h +++ b/arch/e2k/include/asm/sic_regs.h @@ -204,6 +204,18 @@ #define SIC_mc_ch 0x400 #define SIC_mc_status 0x44c +#define SIC_mc_opmb 0x424 +#define SIC_mc0_opmb 0x414 +#define SIC_mc1_opmb 0x454 +#define SIC_mc2_opmb 0x494 +#define SIC_mc3_opmb 0x4d4 + +#define SIC_mc_cfg 0x418 +#define SIC_mc0_cfg 0x418 +#define SIC_mc1_cfg 0x458 +#define SIC_mc2_cfg 0x498 +#define SIC_mc3_cfg 0x4d8 + /* HMU */ #define SIC_hmu_mic 0xd00 #define SIC_hmu0_int 0xd40 @@ -1085,6 +1097,79 @@ typedef union e2k_mc_ecc_struct { /* Structure word */ #define E2K_MC_ECC_secnt fields.secnt /* single error counter */ #define E2K_MC_ECC_reg word + +/* + * Read/Write MCX_OPMb (X={0, 1, 2, 3}) registers + * ! only for P1 processor type ! 
+ */ +typedef unsigned int e2k_mc_opmb_t; /* single word (32 bits) */ +typedef struct e2k_mc_opmb_fields { + e2k_mc_opmb_t ct0 : 3; /* [0:2] */ + e2k_mc_opmb_t ct1 : 3; /* [3:5] */ + e2k_mc_opmb_t pbm0 : 2; /* [6:7] */ + e2k_mc_opmb_t pbm1 : 2; /* [8:9] */ + e2k_mc_opmb_t rm : 1; /* [10] */ + e2k_mc_opmb_t rdodt : 1; /* [11] */ + e2k_mc_opmb_t wrodt : 1; /* [12] */ + e2k_mc_opmb_t bl8int : 1; /* [13] */ + e2k_mc_opmb_t mi_fast : 1; /* [14] */ + e2k_mc_opmb_t mt : 1; /* [15] */ + e2k_mc_opmb_t il : 1; /* [16] */ + e2k_mc_opmb_t rcven_del : 2; /* [17:18] */ + e2k_mc_opmb_t mc_ps : 1; /* [19] */ + e2k_mc_opmb_t arp_en : 1; /* [20] */ + e2k_mc_opmb_t flt_brop : 1; /* [21] */ + e2k_mc_opmb_t flt_rdpr : 1; /* [22] */ + e2k_mc_opmb_t flt_blk : 1; /* [23] */ + e2k_mc_opmb_t parerr : 1; /* [24] */ + e2k_mc_opmb_t cmdpack : 1; /* [25] */ + e2k_mc_opmb_t sldwr : 1; /* [26] */ + e2k_mc_opmb_t sldrd : 1; /* [27] */ + e2k_mc_opmb_t mirr : 1; /* [28] */ + e2k_mc_opmb_t twrwr : 2; /* [29:30] */ + e2k_mc_opmb_t mcln : 1; /* [31] */ +} e2k_mc_opmb_fields_t; +typedef union e2k_mc_opmb_struct { /* Structure word */ + e2k_mc_opmb_fields_t fields; /* as fields */ + e2k_mc_opmb_t word; /* as entire register */ +} e2k_mc_opmb_struct_t; + +#define E2K_MC_OPMB_pbm0 fields.pbm0 /* physical bank map slot 0 */ +#define E2K_MC_OPMB_pbm1 fields.pbm1 /* physical bank map slot 1 */ +#define E2K_MC_OPMB_reg word + +/* + * Read/Write MCX_CFG (X={0, 1, 2, 3}) registers + * P9, E2C3, E12 and E16 processor type + */ +typedef unsigned int e2k_mc_cfg_t; /* single word (32 bits) */ +typedef struct e2k_mc_cfg_fields { + e2k_mc_cfg_t ct0 : 3; /* [0:2] */ + e2k_mc_cfg_t ct1 : 3; /* [3:5] */ + e2k_mc_cfg_t pbm0 : 2; /* [6:7] */ + e2k_mc_cfg_t pbm1 : 2; /* [8:9] */ + e2k_mc_cfg_t rm : 1; /* [10] */ + e2k_mc_cfg_t reserve1 : 2; /* [11:12] */ + e2k_mc_cfg_t mirr : 1; /* [13] */ + e2k_mc_cfg_t sf : 3; /* [14:16] */ + e2k_mc_cfg_t mt : 1; /* [17] */ + e2k_mc_cfg_t cs8 : 1; /* [18] */ + e2k_mc_cfg_t ptrr_mode : 2; /* [19:20] */ + e2k_mc_cfg_t mcrc : 1; /* [21] */ + e2k_mc_cfg_t odt_ext : 2; /* [22:23] */ + e2k_mc_cfg_t pbswap : 1; /* [24] */ + e2k_mc_cfg_t dqw : 2; /* [25:26] */ + e2k_mc_cfg_t pda_sel : 5; /* [27:31] */ +} e2k_mc_cfg_fields_t; +typedef union e2k_mc_cfg_struct { /* Structure word */ + e2k_mc_cfg_fields_t fields; /* as fields */ + e2k_mc_cfg_t word; /* as entire register */ +} e2k_mc_cfg_struct_t; + +#define E2K_MC_CFG_pbm0 fields.pbm0 /* physical bank map slot 0 */ +#define E2K_MC_CFG_pbm1 fields.pbm1 /* physical bank map slot 1 */ +#define E2K_MC_CFG_reg word + /* * Read/Write IPCC_CSRX (X={1, 2, 3}) registers */ diff --git a/arch/e2k/include/asm/sic_regs_access.h b/arch/e2k/include/asm/sic_regs_access.h index 7d2ae6b..601762c 100644 --- a/arch/e2k/include/asm/sic_regs_access.h +++ b/arch/e2k/include/asm/sic_regs_access.h @@ -77,6 +77,9 @@ boot_sic_write_node_nbsr_reg(int node_id, int reg_offset, unsigned int reg_val) unsigned int sic_get_mc_ecc(int node, int num); void sic_set_mc_ecc(int node, int num, unsigned int reg_value); +unsigned int sic_get_mc_opmb(int node, int num); +unsigned int sic_get_mc_cfg(int node, int num); + unsigned int sic_get_ipcc_csr(int node, int num); void sic_set_ipcc_csr(int node, int num, unsigned int val); diff --git a/arch/e2k/include/asm/signal.h b/arch/e2k/include/asm/signal.h index c45cbca..2cd7749 100644 --- a/arch/e2k/include/asm/signal.h +++ b/arch/e2k/include/asm/signal.h @@ -135,7 +135,8 @@ do { \ /* Reserve 64 bytes for kernel per C calling convention */ \ u64 used_dstack_size =
diff --git a/arch/e2k/include/asm/signal.h b/arch/e2k/include/asm/signal.h
index c45cbca..2cd7749 100644
--- a/arch/e2k/include/asm/signal.h
+++ b/arch/e2k/include/asm/signal.h
@@ -135,7 +135,8 @@ do { \
 	/* Reserve 64 bytes for kernel per C calling convention */ \
 	u64 used_dstack_size = round_up(64, E2K_ALIGN_STACK); \
 \
-	sbr = (u64)thread_info_task(ti)->stack + KERNEL_C_STACK_SIZE; \
+	sbr = (u64)thread_info_task(ti)->stack + KERNEL_C_STACK_SIZE + \
+			KERNEL_C_STACK_OFFSET; \
 	AW(usd_lo) = AW((ti)->k_usd_lo); \
 	AW(usd_hi) = AW((ti)->k_usd_hi); \
 	AS(usd_lo).base -= used_dstack_size; \
diff --git a/arch/e2k/include/asm/thread_info.h b/arch/e2k/include/asm/thread_info.h
index 5681b19..9b43a65 100644
--- a/arch/e2k/include/asm/thread_info.h
+++ b/arch/e2k/include/asm/thread_info.h
@@ -241,6 +241,7 @@ typedef struct thread_info {
 						/* hypercall */
 /* End of flags only for virtualization support */
 #define TIF_SYSCALL_TRACEPOINT	30	/* syscall tracepoint instrumentation */
+#define TIF_NAPI_WORK		31	/* napi_wq_worker() is running */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -266,6 +267,7 @@ typedef struct thread_info {
 #define _TIF_LIGHT_HYPERCALL	(1 << TIF_LIGHT_HYPERCALL)
 #define _TIF_GENERIC_HYPERCALL	(1 << TIF_GENERIC_HYPERCALL)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_NAPI_WORK		(1 << TIF_NAPI_WORK)
 
 #define _TIF_WORK_SYSCALL_TRACE	(_TIF_SYSCALL_TRACE | \
 				 _TIF_KERNEL_TRACE | \
diff --git a/arch/e2k/include/asm/tlb_regs_access.h b/arch/e2k/include/asm/tlb_regs_access.h
index b178466..1d20f93 100644
--- a/arch/e2k/include/asm/tlb_regs_access.h
+++ b/arch/e2k/include/asm/tlb_regs_access.h
@@ -104,4 +104,50 @@ get_MMU_phys_addr(e2k_addr_t virt_addr)
 	return __probe_entry(GET_MMU_PHYS_ADDR(virt_addr));
 }
 
-#endif
+typedef struct tlb_set_state {
+	tlb_tag_t	tlb_tag;
+	pte_t		tlb_entry;
+} tlb_set_state_t;
+
+typedef struct tlb_line_state {
+	e2k_addr_t	va;
+	bool		huge;
+	tlb_set_state_t	sets[NATIVE_TLB_SETS_NUM];
+} tlb_line_state_t;
+
+static inline tlb_tag_t
+get_va_tlb_set_tag(e2k_addr_t addr, int set_no, bool large_page)
+{
+	return read_DTLB_va_tag_reg(addr, set_no, large_page);
+}
+
+static inline pte_t
+get_va_tlb_set_entry(e2k_addr_t addr, int set_no, bool large_page)
+{
+	pte_t tlb_entry;
+
+	pte_val(tlb_entry) = read_DTLB_va_entry_reg(addr, set_no, large_page);
+	return tlb_entry;
+}
+
+static inline void
+get_va_tlb_state(tlb_line_state_t *tlb, e2k_addr_t addr, bool large_page)
+{
+	tlb_set_state_t *set_state;
+	int set_no;
+
+	tlb->va = addr;
+	tlb->huge = large_page;
+
+	for (set_no = 0; set_no < NATIVE_TLB_SETS_NUM; set_no++) {
+		tlb_tag_t tlb_tag;
+		pte_t tlb_entry;
+
+		set_state = &tlb->sets[set_no];
+		tlb_tag = get_va_tlb_set_tag(addr, set_no, large_page);
+		tlb_entry = get_va_tlb_set_entry(addr, set_no, large_page);
+		set_state->tlb_tag = tlb_tag;
+		set_state->tlb_entry = tlb_entry;
+	}
+}
+
+#endif	/* !_E2K_TLB_REGS_ACCESS_H_ */
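get_va_tlb_state() above snapshots, in one call, every DTLB set that can hold a translation for a given virtual address. A hedged usage sketch; the dump loop and the unsigned-long casts for printing are illustrative, since the underlying width of tlb_tag_t is not shown here:

static void dump_va_tlb_line(e2k_addr_t addr, bool large_page)
{
	tlb_line_state_t line;
	int set_no;

	get_va_tlb_state(&line, addr, large_page);

	for (set_no = 0; set_no < NATIVE_TLB_SETS_NUM; set_no++) {
		tlb_set_state_t *set = &line.sets[set_no];

		pr_info("va 0x%lx set %d: tag 0x%lx entry 0x%lx\n",
			line.va, set_no,
			(unsigned long)set->tlb_tag,
			(unsigned long)pte_val(set->tlb_entry));
	}
}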
diff --git a/arch/e2k/include/asm/trace-defs.h b/arch/e2k/include/asm/trace-defs.h
new file mode 100644
index 0000000..89d2589
--- /dev/null
+++ b/arch/e2k/include/asm/trace-defs.h
@@ -0,0 +1,115 @@
+#ifndef _E2K_TRACE_DEFS_H_
+#define _E2K_TRACE_DEFS_H_
+
+#include
+
+#include
+#include
+
+static inline void
+trace_get_va_translation(struct mm_struct *mm, e2k_addr_t address,
+	pgdval_t *pgd, pudval_t *pud, pmdval_t *pmd, pteval_t *pte, int *pt_level)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	if (likely(address < TASK_SIZE)) {
+		pgdp = pgd_offset(mm, address);
+
+		*pgd = pgd_val(*pgdp);
+		*pt_level = E2K_PGD_LEVEL_NUM;
+
+		if (!pgd_huge(*pgdp) && !pgd_none(*pgdp) && !pgd_bad(*pgdp)) {
+			pudp = pud_offset(pgdp, address);
+
+			*pud = pud_val(*pudp);
+			*pt_level = E2K_PUD_LEVEL_NUM;
+
+			if (!pud_huge(*pudp) && !pud_none(*pudp) &&
+					!pud_bad(*pudp)) {
+				pmdp = pmd_offset(pudp, address);
+
+				*pmd = pmd_val(*pmdp);
+				*pt_level = E2K_PMD_LEVEL_NUM;
+
+				if (!pmd_huge(*pmdp) && !pmd_none(*pmdp) &&
+						!pmd_bad(*pmdp)) {
+					ptep = pte_offset_map(pmdp, address);
+
+					*pte = pte_val(*ptep);
+					*pt_level = E2K_PTE_LEVEL_NUM;
+				}
+			}
+		}
+		return;
+	}
+
+	pgdp = pgd_offset_k(address);
+	*pgd = pgd_val(*pgdp);
+	*pt_level = E2K_PGD_LEVEL_NUM;
+
+	if (!kernel_pgd_huge(*pgdp) && !pgd_none(*pgdp) && !pgd_bad(*pgdp)) {
+		pudp = pud_offset(pgdp, address);
+		*pud = pud_val(*pudp);
+		*pt_level = E2K_PUD_LEVEL_NUM;
+
+		if (!kernel_pud_huge(*pudp) && !pud_none(*pudp) &&
+				!pud_bad(*pudp)) {
+			pmdp = pmd_offset(pudp, address);
+			*pmd = pmd_val(*pmdp);
+			*pt_level = E2K_PMD_LEVEL_NUM;
+
+			if (!kernel_pmd_huge(*pmdp) && !pmd_none(*pmdp) &&
+					!pmd_bad(*pmdp)) {
+				ptep = pte_offset_kernel(pmdp, address);
+				*pte = pte_val(*ptep);
+				*pt_level = E2K_PTE_LEVEL_NUM;
+			}
+		}
+	}
+}
+
+/*
+ * Save DTLB entries.
+ *
+ * Do not access non-existent entries to avoid
+ * creating "empty" records in DTLB for no reason.
+ */
+static inline void
+trace_get_dtlb_translation(struct mm_struct *mm, e2k_addr_t address,
+	u64 *dtlb_pgd, u64 *dtlb_pud, u64 *dtlb_pmd, u64 *dtlb_pte, int pt_level)
+{
+	*dtlb_pgd = get_MMU_DTLB_ENTRY(address);
+
+	if (pt_level <= E2K_PUD_LEVEL_NUM)
+		*dtlb_pud = get_MMU_DTLB_ENTRY(pud_virt_offset(address));
+
+	if (pt_level <= E2K_PMD_LEVEL_NUM)
+		*dtlb_pmd = get_MMU_DTLB_ENTRY(pmd_virt_offset(address));
+
+	if (pt_level <= E2K_PTE_LEVEL_NUM)
+		*dtlb_pte = get_MMU_DTLB_ENTRY(pte_virt_offset(address));
+}
+
+#define	mmu_print_pt_flags(entry, print, mmu_pt_v6) \
+		(mmu_pt_v6) ? E2K_TRACE_PRINT_PT_V6_FLAGS(entry, print) \
+			    : \
+			E2K_TRACE_PRINT_PT_V2_FLAGS(entry, print)
+#define	print_pt_flags(entry, print) \
+		mmu_print_pt_flags(entry, print, MMU_IS_PT_V6())
+
+#define	E2K_TRACE_PRINT_PT_FLAGS(entry, print)	print_pt_flags(entry, print)
+
+
+#define	mmu_print_dtlb_entry(entry, mmu_dtlb_v6) \
+		((mmu_dtlb_v6) ? E2K_TRACE_PRINT_DTLB_ENTRY_V2(entry) \
+			       : \
+				E2K_TRACE_PRINT_DTLB_ENTRY_V6(entry))
+#define	print_dtlb_entry(entry) \
+		mmu_print_dtlb_entry(entry, MMU_IS_DTLB_V6())
+
+#define	E2K_TRACE_PRINT_DTLB(entry)	print_dtlb_entry(entry)
+
+#endif /* _E2K_TRACE_DEFS_H_ */
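One point worth spelling out about these helpers: pt_level no longer counts 1 (pgd) up to 4 (pte) the way the open-coded walk in trace.h did (removed below); it is left at the E2K_*_LEVEL_NUM of the deepest table actually read, and E2K_PTE_LEVEL_NUM is the smallest of those constants, so callers test "the walk reached at least level X" with `<=`. A hedged consumer sketch, where the -1 fillers mirror the "all f's" convention of the trace output:

static void snapshot_translation(struct mm_struct *mm, e2k_addr_t address)
{
	pgdval_t pgd = -1;
	pudval_t pud = -1;
	pmdval_t pmd = -1;
	pteval_t pte = -1;
	int pt_level;

	trace_get_va_translation(mm, address,
				 &pgd, &pud, &pmd, &pte, &pt_level);

	if (pt_level <= E2K_PTE_LEVEL_NUM)	/* a pte was actually read */
		pr_info("pte for 0x%lx: 0x%lx\n",
			address, (unsigned long)pte);
}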
diff --git a/arch/e2k/include/asm/trace.h b/arch/e2k/include/asm/trace.h
index b26d62c..f6047e3 100644
--- a/arch/e2k/include/asm/trace.h
+++ b/arch/e2k/include/asm/trace.h
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 
 #define E2K_TC_TYPE_STORE	(1ULL << 17)
 #define E2K_TC_TYPE_S_F	(1ULL << 19)
@@ -125,25 +126,6 @@ TRACE_EVENT(
 		))
 );
 
-#define	mmu_print_pt_flags(entry, print, mmu_pt_v6) \
-		(mmu_pt_v6) ? E2K_TRACE_PRINT_PT_V6_FLAGS(entry, print) \
-			    : \
-			E2K_TRACE_PRINT_PT_V2_FLAGS(entry, print)
-#define	print_pt_flags(entry, print) \
-		mmu_print_pt_flags(entry, print, MMU_IS_PT_V6())
-
-#define	E2K_TRACE_PRINT_PT_FLAGS(entry, print)	print_pt_flags(entry, print)
-
-
-#define	mmu_print_dtlb_entry(entry, mmu_dtlb_v6) \
-		((mmu_dtlb_v6) ? E2K_TRACE_PRINT_DTLB_ENTRY_V2(entry) \
-			       : \
-				E2K_TRACE_PRINT_DTLB_ENTRY_V6(entry))
-#define	print_dtlb_entry(entry) \
-		mmu_print_dtlb_entry(entry, MMU_IS_DTLB_V6())
-
-#define	E2K_TRACE_PRINT_DTLB(entry)	print_dtlb_entry(entry)
-
 TRACE_EVENT(
 	unhandled_page_fault,
 
@@ -157,91 +139,19 @@
 		__field(	u64,	dtlb_pud	)
 		__field(	u64,	dtlb_pmd	)
 		__field(	u64,	dtlb_pte	)
-		__field(	u64,	pgd	)
-		__field(	u64,	pud	)
-		__field(	u64,	pmd	)
-		__field(	u64,	pte	)
+		__field(	pgdval_t,	pgd	)
+		__field(	pudval_t,	pud	)
+		__field(	pmdval_t,	pmd	)
+		__field(	pteval_t,	pte	)
 		__field(	int,	pt_level	)
 	),
 
 	TP_fast_assign(
-		pgd_t *pgdp;
-		pud_t *pudp;
-		pmd_t *pmdp;
-		pte_t *ptep;
-
 		__entry->address = address;
 
-		/*
-		 * Save page table entries
-		 */
-		__entry->pt_level = 0;
-
-		if (address < TASK_SIZE) {
-			struct mm_struct *mm = current->mm;
-
-			pgdp = pgd_offset(mm, address);
-
-			__entry->pgd = pgd_val(*pgdp);
-			__entry->pt_level = 1;
-
-			if (!pgd_huge(*pgdp) && !pgd_none(*pgdp) &&
-					!pgd_bad(*pgdp)) {
-				pudp = pud_offset(pgdp, address);
-
-				__entry->pud = pud_val(*pudp);
-				__entry->pt_level = 2;
-
-				if (!pud_huge(*pudp) && !pud_none(*pudp) &&
-						!pud_bad(*pudp)) {
-					pmdp = pmd_offset(pudp, address);
-
-					__entry->pmd = pmd_val(*pmdp);
-					__entry->pt_level = 3;
-
-					if (!pmd_huge(*pmdp) &&
-							!pmd_none(*pmdp) &&
-							!pmd_bad(*pmdp)) {
-						ptep = pte_offset_map(pmdp,
-								address);
-
-						__entry->pte = pte_val(*ptep);
-						__entry->pt_level = 4;
-					}
-				}
-			}
-		} else {
-			pgdp = pgd_offset_k(address);
-
-			__entry->pgd = pgd_val(*pgdp);
-			__entry->pt_level = 1;
-
-			if (!kernel_pgd_huge(*pgdp) &&
-					!pgd_none(*pgdp) && !pgd_bad(*pgdp)) {
-				pudp = pud_offset(pgdp, address);
-
-				__entry->pud = pud_val(*pudp);
-				__entry->pt_level = 2;
-
-				if (!kernel_pud_huge(*pudp) &&
-						!pud_none(*pudp) && !pud_bad(*pudp)) {
-					pmdp = pmd_offset(pudp, address);
-
-					__entry->pmd = pmd_val(*pmdp);
-					__entry->pt_level = 3;
-
-					if (!kernel_pmd_huge(*pmdp) &&
-							!pmd_none(*pmdp) &&
-							!pmd_bad(*pmdp)) {
-						ptep = pte_offset_kernel(pmdp,
-								address);
-
-						__entry->pte = pte_val(*ptep);
-						__entry->pt_level = 4;
-					}
-				}
-			}
-		}
+		trace_get_va_translation(current->mm, address,
+			&__entry->pgd, &__entry->pud, &__entry->pmd,
+			&__entry->pte, &__entry->pt_level);
 
 		/*
 		 * Save DTLB entries.
@@ -249,30 +159,21 @@
 		 * Do not access not existing entries to avoid
 		 * creating "empty" records in DTLB for no reason.
 		 */
-		__entry->dtlb_entry = get_MMU_DTLB_ENTRY(address);
-
-		if (__entry->pt_level >= 2)
-			__entry->dtlb_pud = get_MMU_DTLB_ENTRY(
-					pud_virt_offset(address));
-
-		if (__entry->pt_level >= 3)
-			__entry->dtlb_pmd = get_MMU_DTLB_ENTRY(
-					pmd_virt_offset(address));
-
-		if (__entry->pt_level >= 4)
-			__entry->dtlb_pte = get_MMU_DTLB_ENTRY(
-					pte_virt_offset(address));
+		trace_get_dtlb_translation(current->mm, address,
+			&__entry->dtlb_entry, &__entry->dtlb_pud,
+			&__entry->dtlb_pmd, &__entry->dtlb_pte,
+			__entry->pt_level);
 	),
 
 	TP_printk("\n"
 		"Page table for address 0x%lx (all f's are returned if the entry has not been read)\n"
-		"  pgd 0x%llx: %s\n"
+		"  pgd 0x%lx: %s\n"
 		"    Access mode: %s%s\n"
-		"  pud 0x%llx: %s\n"
+		"  pud 0x%lx: %s\n"
 		"    Access mode: %s%s\n"
-		"  pmd 0x%llx: %s\n"
+		"  pmd 0x%lx: %s\n"
 		"    Access mode: %s%s\n"
-		"  pte 0x%llx: %s\n"
+		"  pte 0x%lx: %s\n"
 		"    Access mode: %s%s\n"
 		"Probed DTLB entries:\n"
 		"  pud address entry 0x%llx: %s\n"
@@ -281,22 +182,26 @@
 		"      address entry 0x%llx: %s"
 		,
 		__entry->address,
-		(__entry->pt_level >= 1) ? __entry->pgd : -1ULL,
-		E2K_TRACE_PRINT_PT_FLAGS(__entry->pgd, __entry->pt_level >= 1),
-		(__entry->pt_level >= 2) ? __entry->pud : -1ULL,
-		E2K_TRACE_PRINT_PT_FLAGS(__entry->pud, __entry->pt_level >= 2),
-		(__entry->pt_level >= 3) ? __entry->pmd : -1ULL,
-		E2K_TRACE_PRINT_PT_FLAGS(__entry->pmd, __entry->pt_level >= 3),
-		(__entry->pt_level >= 4) ? __entry->pte : -1ULL,
-		E2K_TRACE_PRINT_PT_FLAGS(__entry->pte, __entry->pt_level >= 4),
-		(__entry->pt_level >= 2) ? __entry->dtlb_pud : -1ULL,
-		(__entry->pt_level >= 2) ?
+		(__entry->pt_level <= E2K_PGD_LEVEL_NUM) ? __entry->pgd : -1UL,
+		E2K_TRACE_PRINT_PT_FLAGS(__entry->pgd,
+				__entry->pt_level <= E2K_PGD_LEVEL_NUM),
+		(__entry->pt_level <= E2K_PUD_LEVEL_NUM) ? __entry->pud : -1UL,
+		E2K_TRACE_PRINT_PT_FLAGS(__entry->pud,
+				__entry->pt_level <= E2K_PUD_LEVEL_NUM),
+		(__entry->pt_level <= E2K_PMD_LEVEL_NUM) ? __entry->pmd : -1UL,
+		E2K_TRACE_PRINT_PT_FLAGS(__entry->pmd,
+				__entry->pt_level <= E2K_PMD_LEVEL_NUM),
+		(__entry->pt_level <= E2K_PTE_LEVEL_NUM) ? __entry->pte : -1UL,
+		E2K_TRACE_PRINT_PT_FLAGS(__entry->pte,
+				__entry->pt_level <= E2K_PTE_LEVEL_NUM),
+		(__entry->pt_level <= E2K_PUD_LEVEL_NUM) ? __entry->dtlb_pud : -1ULL,
+		(__entry->pt_level <= E2K_PUD_LEVEL_NUM) ?
 			E2K_TRACE_PRINT_DTLB(__entry->dtlb_pud) : "(not read)",
-		(__entry->pt_level >= 3) ? __entry->dtlb_pmd : -1ULL,
-		(__entry->pt_level >= 3) ?
+		(__entry->pt_level <= E2K_PMD_LEVEL_NUM) ? __entry->dtlb_pmd : -1ULL,
+		(__entry->pt_level <= E2K_PMD_LEVEL_NUM) ?
 			E2K_TRACE_PRINT_DTLB(__entry->dtlb_pmd) : "(not read)",
-		(__entry->pt_level >= 4) ? __entry->dtlb_pte : -1ULL,
-		(__entry->pt_level >= 4) ?
+		(__entry->pt_level <= E2K_PTE_LEVEL_NUM) ? __entry->dtlb_pte : -1ULL,
+		(__entry->pt_level <= E2K_PTE_LEVEL_NUM) ?
 			E2K_TRACE_PRINT_DTLB(__entry->dtlb_pte) : "(not read)",
 		__entry->dtlb_entry,
 		E2K_TRACE_PRINT_DTLB(__entry->dtlb_entry))
diff --git a/arch/e2k/include/asm/trace_pgtable-v2.h b/arch/e2k/include/asm/trace_pgtable-v2.h
index 530b2e0..49930ff 100644
--- a/arch/e2k/include/asm/trace_pgtable-v2.h
+++ b/arch/e2k/include/asm/trace_pgtable-v2.h
@@ -1,6 +1,3 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM e2k
-
 #if !defined(_TRACE_E2K_PGTABLE_V2_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_E2K_PGTABLE_V2_H
diff --git a/arch/e2k/include/asm/trace_pgtable-v6.h b/arch/e2k/include/asm/trace_pgtable-v6.h
index a21938c..3f60092 100644
--- a/arch/e2k/include/asm/trace_pgtable-v6.h
+++ b/arch/e2k/include/asm/trace_pgtable-v6.h
@@ -1,6 +1,3 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM e2k
-
 #if !defined(_TRACE_E2K_PGTABLE_V6_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_E2K_PGTABLE_V6_H
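The two hunks above drop TRACE_SYSTEM from the pgtable helper headers, which define only flag-printing macros; presumably this avoids redefinition now that they are pulled in (via the new trace-defs.h) by headers that set TRACE_SYSTEM themselves. A sketch of the resulting convention; the include lines are illustrative, the point being that only the top-level trace header owns the name:

/* arch/e2k/include/asm/trace.h -- the one owner of TRACE_SYSTEM */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM e2k

/* helper headers stay neutral: printing macros only, no TRACE_SYSTEM */
#include <asm/trace_pgtable-v2.h>
#include <asm/trace_pgtable-v6.h>
#include <asm/trace-defs.h>

/* ... TRACE_EVENT() definitions using the helpers ... */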
diff --git a/arch/e2k/include/asm/trap_table.S.h b/arch/e2k/include/asm/trap_table.S.h
index ea59739..846503a 100644
--- a/arch/e2k/include/asm/trap_table.S.h
+++ b/arch/e2k/include/asm/trap_table.S.h
@@ -146,9 +146,10 @@ .macro HANDLER_TRAMPOLINE ctprN, scallN, fn, wbsL
 	/* Force load OSGD->GD. Alternative is to use non-0 CUI for kernel */
 	{
-		nop
 		sdisp	\ctprN, \scallN
 	}
+	/* CPU_HWBUG_VIRT_PSIZE_INTERCEPTION */
+	{ nop } { nop } { nop } { nop }
 	call	\ctprN, wbs=\wbsL
 	disp	\ctprN, \fn
 	SWITCH_HW_STACKS_FROM_USER()
diff --git a/arch/e2k/include/asm/trap_table.h b/arch/e2k/include/asm/trap_table.h
index 707460c..ec60a42 100644
--- a/arch/e2k/include/asm/trap_table.h
+++ b/arch/e2k/include/asm/trap_table.h
@@ -249,7 +249,8 @@ is_kernel_data_stack_bounds(bool on_kernel, e2k_usd_lo_t usd_lo)
 #include
 
 #ifndef __ASSEMBLY__
-static inline void init_pt_regs_for_syscall(struct pt_regs *regs)
+__always_inline /* For CPU_HWBUG_VIRT_PSIZE_INTERCEPTION */
+static void init_pt_regs_for_syscall(struct pt_regs *regs)
 {
 	regs->next = NULL;
 	regs->trap = NULL;
diff --git a/arch/e2k/include/asm/unistd.h b/arch/e2k/include/asm/unistd.h
index b0fed80..b545a73 100644
--- a/arch/e2k/include/asm/unistd.h
+++ b/arch/e2k/include/asm/unistd.h
@@ -31,6 +31,7 @@
 #define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_SYS_ALARM
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
diff --git a/arch/e2k/include/uapi/asm/unistd.h b/arch/e2k/include/uapi/asm/unistd.h
index 5dafd19..7d0ba04 100644
--- a/arch/e2k/include/uapi/asm/unistd.h
+++ b/arch/e2k/include/uapi/asm/unistd.h
@@ -222,25 +222,10 @@
 #define __NR_stat64			195
 #define __NR_lstat64			196
 #define __NR_fstat64			197
-#define __NR_lchown32			198
-#define __NR_getuid32			199
-#define __NR_getgid32			200
-#define __NR_geteuid32			201
-#define __NR_getegid32			202
-#define __NR_setreuid32			203
-#define __NR_setregid32			204
+
 #define __NR_pidfd_send_signal		205
 #define __NR_pidfd_open			206
-#define __NR_fchown32			207
-#define __NR_setresuid32		208
-#define __NR_getresuid32		209
-#define __NR_setresgid32		210
-#define __NR_getresgid32		211
-#define __NR_chown32			212
-#define __NR_setuid32			213
-#define __NR_setgid32			214
-#define __NR_setfsuid32			215
-#define __NR_setfsgid32			216
+
 #define __NR_pivot_root			217
 #define __NR_mincore			218
 #define __NR_madvise			219
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f6dcd58..ab18a60 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -701,6 +701,9 @@ struct sk_buff {
 		};
 		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
+#ifdef CONFIG_MCST
+		struct list_head	napi_skb_list;
+#endif
 	};
 	union {
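Finally, on the __ARCH_WANT_SYS_CLONE3 hunk above: defining it compiles the generic clone3 entry point into the e2k kernel. A hedged userspace sketch of plain fork()-style clone3 usage, not taken from the patch; __NR_clone3 and struct clone_args come from sufficiently new kernel/libc headers:

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args */
#include <sys/syscall.h>	/* __NR_clone3 */
#include <signal.h>		/* SIGCHLD */
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct clone_args args;
	long pid;

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD;	/* behave like fork() */

	pid = syscall(__NR_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0) {
		printf("child\n");
		_exit(0);
	}
	printf("parent: child pid %ld\n", pid);
	return 0;
}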