From b0b49f2673f011cad7deeabf7a683b388c351278 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 13 Mar 2014 16:01:26 -0700 Subject: [PATCH 01/23] x86, vdso: Remove compat vdso support The compat vDSO is a complicated hack that's needed to maintain compatibility with a small range of glibc versions. This removes it and replaces it with a much simpler hack: a config option to disable the 32-bit vDSO by default. This also changes the default value of CONFIG_COMPAT_VDSO to n -- users configuring kernels from scratch almost certainly want that choice. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/4bb4690899106eb11430b1186d5cc66ca9d1660c.1394751608.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 22 ++- arch/x86/Kconfig | 26 +++- arch/x86/include/asm/elf.h | 4 - arch/x86/include/asm/fixmap.h | 8 - arch/x86/include/asm/vdso.h | 5 +- arch/x86/vdso/vdso-layout.lds.S | 2 +- arch/x86/vdso/vdso32-setup.c | 230 +++------------------------- arch/x86/vdso/vdso32/vdso32.lds.S | 2 - 8 files changed, 57 insertions(+), 242 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7116fda7077f..86019750e15f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3409,14 +3409,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted. of CONFIG_HIGHPTE. vdso= [X86,SH] - vdso=2: enable compat VDSO (default with COMPAT_VDSO) - vdso=1: enable VDSO (default) + On X86_32, this is an alias for vdso32=. Otherwise: + + vdso=1: enable VDSO (the default) vdso=0: disable VDSO mapping - vdso32= [X86] - vdso32=2: enable compat VDSO (default with COMPAT_VDSO) - vdso32=1: enable 32-bit VDSO (default) - vdso32=0: disable 32-bit VDSO mapping + vdso32= [X86] Control the 32-bit vDSO + vdso32=1: enable 32-bit VDSO + vdso32=0 or vdso32=2: disable 32-bit VDSO + + See the help text for CONFIG_COMPAT_VDSO for more + details. If CONFIG_COMPAT_VDSO is set, the default is + vdso32=0; otherwise, the default is vdso32=1. + + For compatibility with older kernels, vdso32=2 is an + alias for vdso32=0. + + Try vdso32=0 if you encounter an error that says: + dl_main: Assertion `(void *) ph->p_vaddr == _rtld_local._dl_sysinfo_dso' failed! vector= [IA-64,SMP] vector=percpu: enable percpu vector domain diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0af5250d914f..9122f6bcbe45 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1836,17 +1836,29 @@ config DEBUG_HOTPLUG_CPU0 If unsure, say N. config COMPAT_VDSO - def_bool y - prompt "Compat VDSO support" + def_bool n + prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" depends on X86_32 || IA32_EMULATION ---help--- - Map the 32-bit VDSO to the predictable old-style address too. + Certain buggy versions of glibc will crash if they are + presented with a 32-bit vDSO that is not mapped at the address + indicated in its segment table. - Say N here if you are running a sufficiently recent glibc - version (2.3.3 or later), to remove the high-mapped - VDSO mapping and to exclusively use the randomized VDSO. + The bug was introduced by f866314b89d56845f55e6f365e18b31ec978ec3a + and fixed by 3b3ddb4f7db98ec9e912ccdf54d35df4aa30e04a and + 49ad572a70b8aeb91e57483a11dd1b77e31c4468. Glibc 2.3.3 is + the only released version with the bug, but OpenSUSE 9 + contains a buggy "glibc 2.3.2". - If unsure, say Y. + The symptom of the bug is that everything crashes on startup, saying: + dl_main: Assertion `(void *) ph->p_vaddr == _rtld_local._dl_sysinfo_dso' failed! + + Saying Y here changes the default value of the vdso32 boot + option from 1 to 0, which turns off the 32-bit vDSO entirely. + This works around the glibc bug but hurts performance. + + If unsure, say N: if you are compiling your own kernel, you + are unlikely to be using a buggy version of glibc. config CMDLINE_BOOL bool "Built-in kernel command line" diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9c999c1674fa..2c71182d30ef 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -281,16 +281,12 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) - #define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #else /* CONFIG_X86_32 */ -#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */ - /* 1GB for 64bit, 8MB for 32bit */ #define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7252cd339175..2377f5618fb7 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -40,15 +40,8 @@ */ extern unsigned long __FIXADDR_TOP; #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) - -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) #else #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) - -/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ -#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) #endif @@ -74,7 +67,6 @@ extern unsigned long __FIXADDR_TOP; enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, - FIX_VDSO, #else VSYSCALL_LAST_PAGE, VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index fddb53d63915..5594e84d65d9 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -2,8 +2,6 @@ #define _ASM_X86_VDSO_H #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const char VDSO32_PRELINK[]; - /* * Given a pointer to the vDSO image, find the pointer to VDSO32_name * as that symbol is defined in the vDSO sources or linker script. @@ -11,8 +9,7 @@ extern const char VDSO32_PRELINK[]; #define VDSO32_SYMBOL(base, name) \ ({ \ extern const char VDSO32_##name[]; \ - (void __user *)(VDSO32_##name - VDSO32_PRELINK + \ - (unsigned long)(base)); \ + (void __user *)(VDSO32_##name + (unsigned long)(base)); \ }) #endif diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 634a2cf62046..8c550c1976f3 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -6,7 +6,7 @@ SECTIONS { - . = VDSO_PRELINK + SIZEOF_HEADERS; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index d6bfb876cfb0..ab20c04b688a 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -26,16 +26,10 @@ #include #include -enum { - VDSO_DISABLED = 0, - VDSO_ENABLED = 1, - VDSO_COMPAT = 2, -}; - #ifdef CONFIG_COMPAT_VDSO -#define VDSO_DEFAULT VDSO_COMPAT +#define VDSO_DEFAULT 0 #else -#define VDSO_DEFAULT VDSO_ENABLED +#define VDSO_DEFAULT 1 #endif #ifdef CONFIG_X86_64 @@ -43,13 +37,6 @@ enum { #define arch_setup_additional_pages syscall32_setup_pages #endif -/* - * This is the difference between the prelinked addresses in the vDSO images - * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO - * in the user address space. - */ -#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) - /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? @@ -60,6 +47,9 @@ static int __init vdso_setup(char *s) { vdso_enabled = simple_strtoul(s, NULL, 0); + if (vdso_enabled > 1) + pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + return 1; } @@ -76,123 +66,6 @@ __setup_param("vdso=", vdso32_setup, vdso_setup, 0); EXPORT_SYMBOL_GPL(vdso_enabled); #endif -static __init void reloc_symtab(Elf32_Ehdr *ehdr, - unsigned offset, unsigned size) -{ - Elf32_Sym *sym = (void *)ehdr + offset; - unsigned nsym = size / sizeof(*sym); - unsigned i; - - for(i = 0; i < nsym; i++, sym++) { - if (sym->st_shndx == SHN_UNDEF || - sym->st_shndx == SHN_ABS) - continue; /* skip */ - - if (sym->st_shndx > SHN_LORESERVE) { - printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", - sym->st_shndx); - continue; - } - - switch(ELF_ST_TYPE(sym->st_info)) { - case STT_OBJECT: - case STT_FUNC: - case STT_SECTION: - case STT_FILE: - sym->st_value += VDSO_ADDR_ADJUST; - } - } -} - -static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) -{ - Elf32_Dyn *dyn = (void *)ehdr + offset; - - for(; dyn->d_tag != DT_NULL; dyn++) - switch(dyn->d_tag) { - case DT_PLTGOT: - case DT_HASH: - case DT_STRTAB: - case DT_SYMTAB: - case DT_RELA: - case DT_INIT: - case DT_FINI: - case DT_REL: - case DT_DEBUG: - case DT_JMPREL: - case DT_VERSYM: - case DT_VERDEF: - case DT_VERNEED: - case DT_ADDRRNGLO ... DT_ADDRRNGHI: - /* definitely pointers needing relocation */ - dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; - break; - - case DT_ENCODING ... OLD_DT_LOOS-1: - case DT_LOOS ... DT_HIOS-1: - /* Tags above DT_ENCODING are pointers if - they're even */ - if (dyn->d_tag >= DT_ENCODING && - (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; - break; - - case DT_VERDEFNUM: - case DT_VERNEEDNUM: - case DT_FLAGS_1: - case DT_RELACOUNT: - case DT_RELCOUNT: - case DT_VALRNGLO ... DT_VALRNGHI: - /* definitely not pointers */ - break; - - case OLD_DT_LOOS ... DT_LOOS-1: - case DT_HIOS ... DT_VALRNGLO-1: - default: - if (dyn->d_tag > DT_ENCODING) - printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", - dyn->d_tag); - break; - } -} - -static __init void relocate_vdso(Elf32_Ehdr *ehdr) -{ - Elf32_Phdr *phdr; - Elf32_Shdr *shdr; - int i; - - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 || - !elf_check_arch_ia32(ehdr) || - ehdr->e_type != ET_DYN); - - ehdr->e_entry += VDSO_ADDR_ADJUST; - - /* rebase phdrs */ - phdr = (void *)ehdr + ehdr->e_phoff; - for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += VDSO_ADDR_ADJUST; - - /* relocate dynamic stuff */ - if (phdr[i].p_type == PT_DYNAMIC) - reloc_dyn(ehdr, phdr[i].p_offset); - } - - /* rebase sections */ - shdr = (void *)ehdr + ehdr->e_shoff; - for(i = 0; i < ehdr->e_shnum; i++) { - if (!(shdr[i].sh_flags & SHF_ALLOC)) - continue; - - shdr[i].sh_addr += VDSO_ADDR_ADJUST; - - if (shdr[i].sh_type == SHT_SYMTAB || - shdr[i].sh_type == SHT_DYNSYM) - reloc_symtab(ehdr, shdr[i].sh_offset, - shdr[i].sh_size); - } -} - static struct page *vdso32_pages[1]; #ifdef CONFIG_X86_64 @@ -212,12 +85,6 @@ void syscall32_cpu_init(void) wrmsrl(MSR_CSTAR, ia32_cstar_target); } -#define compat_uses_vma 1 - -static inline void map_compat_vdso(int map) -{ -} - #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) @@ -241,37 +108,6 @@ void enable_sep_cpu(void) put_cpu(); } -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - - return 0; -} - -#define compat_uses_vma 0 - -static void map_compat_vdso(int map) -{ - static int vdso_mapped; - - if (map == vdso_mapped) - return; - - vdso_mapped = map; - - __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, - map ? PAGE_READONLY_EXEC : PAGE_NONE); - - /* flush stray tlbs */ - flush_tlb_all(); -} - #endif /* CONFIG_X86_64 */ int __init sysenter_setup(void) @@ -282,10 +118,6 @@ int __init sysenter_setup(void) vdso32_pages[0] = virt_to_page(syscall_page); -#ifdef CONFIG_X86_32 - gate_vma_init(); -#endif - if (vdso32_syscall()) { vsyscall = &vdso32_syscall_start; vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; @@ -298,7 +130,6 @@ int __init sysenter_setup(void) } memcpy(syscall_page, vsyscall, vsyscall_len); - relocate_vdso(syscall_page); return 0; } @@ -309,48 +140,35 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) struct mm_struct *mm = current->mm; unsigned long addr; int ret = 0; - bool compat; #ifdef CONFIG_X86_X32_ABI if (test_thread_flag(TIF_X32)) return x32_setup_additional_pages(bprm, uses_interp); #endif - if (vdso_enabled == VDSO_DISABLED) + if (vdso_enabled != 1) /* Other values all mean "disabled" */ return 0; down_write(&mm->mmap_sem); - /* Test compat mode once here, in case someone - changes it via sysctl */ - compat = (vdso_enabled == VDSO_COMPAT); - - map_compat_vdso(compat); - - if (compat) - addr = VDSO_HIGH_BASE; - else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; } current->mm->context.vdso = (void *)addr; - if (compat_uses_vma || !compat) { - /* - * MAYWRITE to allow gdb to COW and set breakpoints - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso32_pages); + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso32_pages); - if (ret) - goto up_fail; - } + if (ret) + goto up_fail; current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); @@ -411,20 +229,12 @@ const char *arch_vma_name(struct vm_area_struct *vma) struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - /* - * Check to see if the corresponding task was created in compat vdso - * mode. - */ - if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) - return &gate_vma; return NULL; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(mm); - - return vma && addr >= vma->vm_start && addr < vma->vm_end; + return 0; } int in_gate_area_no_mm(unsigned long addr) diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index 976124bb5f92..90e7aa90dbe4 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -8,7 +8,6 @@ * values visible using the asm-x86/vdso.h macros from the kernel proper. */ -#define VDSO_PRELINK 0 #include "../vdso-layout.lds.S" /* The ELF entry point can be used to set the AT_SYSINFO value. */ @@ -31,7 +30,6 @@ VERSION /* * Symbols we define here called VDSO* get their values into vdso32-syms.h. */ -VDSO32_PRELINK = VDSO_PRELINK; VDSO32_vsyscall = __kernel_vsyscall; VDSO32_sigreturn = __kernel_sigreturn; VDSO32_rt_sigreturn = __kernel_rt_sigreturn; From 7dda038756704b3562187b29c81f86de935148c6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 13 Mar 2014 16:01:27 -0700 Subject: [PATCH 02/23] x86_32, mm: Remove user bit from identity map PDE The only reason that the user bit was set was to support userspace access to the compat vDSO in the fixmap. The compat vDSO is gone, so the user bit can be removed. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e240a977f3c7cbd525a091fd6521499ec4b8e94f.1394751608.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_types.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 1aa9ccd43223..757f9c40321b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -214,13 +214,8 @@ #ifdef CONFIG_X86_64 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC #else -/* - * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection - * bits are combined, this will alow user to access the high address mapped - * VDSO in the presence of CONFIG_COMPAT_VDSO - */ #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ -#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif From 1f2cbcf648962cdcf511d234cb39745baa9f5d07 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 13 Mar 2014 19:44:47 -0700 Subject: [PATCH 03/23] x86, vdso, xen: Remove stray reference to FIX_VDSO Checkin b0b49f2673f0 x86, vdso: Remove compat vdso support ... removed the VDSO from the fixmap, and thus FIX_VDSO; remove a stray reference in Xen. Found by Fengguang Wu's test robot. Reported-by: Fengguang Wu Cc: Andy Lutomirski Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Link: http://lkml.kernel.org/r/4bb4690899106eb11430b1186d5cc66ca9d1660c.1394751608.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 256282e7888b..21c6a420e19a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2058,7 +2058,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) case FIX_RO_IDT: #ifdef CONFIG_X86_32 case FIX_WP_TEST: - case FIX_VDSO: # ifdef CONFIG_HIGHMEM case FIX_KMAP_BEGIN ... FIX_KMAP_END: # endif From d2312e3379d581d2c3603357a0181046448e1de3 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:01 +0100 Subject: [PATCH 04/23] x86, vdso: Make vsyscall_gtod_data handling x86 generic This patch move the vsyscall_gtod_data handling out of vsyscall_64.c into an additonal file vsyscall_gtod.c to make the functionality available for x86 32 bit kernel. It also adds a new vsyscall_32.c which setup the VVAR page. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-2-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 +- arch/x86/include/asm/clocksource.h | 4 -- arch/x86/include/asm/vvar.h | 12 ++++-- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/hpet.c | 2 - arch/x86/kernel/tsc.c | 2 - arch/x86/kernel/vmlinux.lds.S | 3 -- arch/x86/kernel/vsyscall_64.c | 45 ----------------------- arch/x86/kernel/vsyscall_gtod.c | 59 ++++++++++++++++++++++++++++++ arch/x86/tools/relocs.c | 2 +- 10 files changed, 72 insertions(+), 63 deletions(-) create mode 100644 arch/x86/kernel/vsyscall_gtod.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9122f6bcbe45..ab3ebc8ba3b9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -107,9 +107,9 @@ config X86 select HAVE_ARCH_SOFT_DIRTY select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA if X86_64 + select ARCH_CLOCKSOURCE_DATA select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL if X86_64 + select GENERIC_TIME_VSYSCALL select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 16a57f4ed64d..eda81dc0f4ae 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,8 +3,6 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#ifdef CONFIG_X86_64 - #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ @@ -14,6 +12,4 @@ struct arch_clocksource_data { int vclock_mode; }; -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_CLOCKSOURCE_H */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index d76ac40da206..0a534eac5f56 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -16,9 +16,6 @@ * you mess up, the linker will catch it.) */ -/* Base address of vvars. This is not ABI. */ -#define VVAR_ADDRESS (-10*1024*1024 - 4096) - #if defined(__VVAR_KERNEL_LDS) /* The kernel linker script defines its own magic to put vvars in the @@ -29,6 +26,15 @@ #else +extern char __vvar_page; + +/* Base address of vvars. This is not ABI. */ +#ifdef CONFIG_X86_64 +#define VVAR_ADDRESS (-10*1024*1024 - 4096) +#else +#define VVAR_ADDRESS (&__vvar_page) +#endif + #define DECLARE_VVAR(offset, type, name) \ static type const * const vvaraddr_ ## name = \ (void *)(VVAR_ADDRESS + (offset)); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cb648c84b327..f4d96000d33a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-y += syscall_$(BITS).o +obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index da85a8e830a1..e4b86abd08ca 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -752,9 +752,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_HPET }, -#endif }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfbe99f88830..227dcfc0e5e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -985,9 +985,7 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, -#ifdef CONFIG_X86_64 .archdata = { .vclock_mode = VCLOCK_TSC }, -#endif }; void mark_tsc_unstable(char *reason) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index da6b35a98260..1d4897baf1cc 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -147,7 +147,6 @@ SECTIONS _edata = .; } :data -#ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); __vvar_page = .; @@ -169,8 +168,6 @@ SECTIONS . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); -#endif /* CONFIG_X86_64 */ - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1f96f9347ed9..9ea287666c65 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -47,14 +47,12 @@ #include #include #include -#include #include #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -77,48 +75,6 @@ static int __init vsyscall_setup(char *str) } early_param("vsyscall", vsyscall_setup); -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.sys_tz = sys_tz; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - write_seqcount_begin(&vdata->seq); - - /* copy vsyscall data */ - vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->clock->cycle_last; - vdata->clock.mask = tk->clock->mask; - vdata->clock.mult = tk->mult; - vdata->clock.shift = tk->shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec - << tk->shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse.tv_sec = tk->xtime_sec; - vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - - vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse, - tk->wall_to_monotonic); - - write_seqcount_end(&vdata->seq); -} - static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { @@ -374,7 +330,6 @@ void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - extern char __vvar_page; unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c new file mode 100644 index 000000000000..b5a943dba9f3 --- /dev/null +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include +#include + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.sys_tz = sys_tz; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + write_seqcount_begin(&vdata->seq); + + /* copy vsyscall data */ + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->clock->cycle_last; + vdata->clock.mask = tk->clock->mask; + vdata->clock.mult = tk->mult; + vdata->clock.shift = tk->shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse.tv_sec = tk->xtime_sec; + vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + + vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse, + tk->wall_to_monotonic); + + write_seqcount_end(&vdata->seq); +} diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index cfbdbdb4e173..bbb1d2259ecf 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -69,8 +69,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__per_cpu_load|" "init_per_cpu__.*|" "__end_rodata_hpage_align|" - "__vvar_page|" #endif + "__vvar_page|" "_end)$" }; From 3935ed6a3a533c1736e3ca65bff72afd1773be27 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:02 +0100 Subject: [PATCH 05/23] mm: Add new func _install_special_mapping() to mmap.c The _install_special_mapping() is the new base function for install_special_mapping(). This function will return a pointer of the created VMA or a error code in an ERR_PTR() This new function will be needed by the for the vdso 32 bit support to map the additonal vvar and hpet pages into the 32 bit address space. This will be done with io_remap_pfn_range() and remap_pfn_range, which requieres a vm_area_struct. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-3-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 3 +++ mm/mmap.c | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c1b7414c7bef..6c7fedf5db2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1750,6 +1750,9 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); +extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long flags, struct page **pages); extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); diff --git a/mm/mmap.c b/mm/mmap.c index 20ff0c33274c..81ba54ff96c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2918,7 +2918,7 @@ static const struct vm_operations_struct special_mapping_vmops = { * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ -int install_special_mapping(struct mm_struct *mm, +struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { @@ -2927,7 +2927,7 @@ int install_special_mapping(struct mm_struct *mm, vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; @@ -2948,11 +2948,23 @@ int install_special_mapping(struct mm_struct *mm, perf_event_mmap(vma); - return 0; + return vma; out: kmem_cache_free(vm_area_cachep, vma); - return ret; + return ERR_PTR(ret); +} + +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = _install_special_mapping(mm, + addr, len, vm_flags, pages); + + if (IS_ERR(vma)) + return PTR_ERR(vma); + return 0; } static DEFINE_MUTEX(mm_all_locks_mutex); From 411f790cd7e91fac0db80d3cf789cb6deeac298e Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:03 +0100 Subject: [PATCH 06/23] x86, vdso: Revamp vclock_gettime.c This intermediate patch revamps the vclock_gettime.c by moving some functions around. It is only for spliting purpose, to make whole the 32 bit vdso timer patch easier to review. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-4-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vclock_gettime.c | 93 +++++++++++++++++----------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index eb5d7a56f8d4..bbc80657050f 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -26,43 +26,28 @@ #define gtod (&VVAR(vsyscall_gtod_data)) -notrace static cycle_t vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = VVAR(vsyscall_gtod_data).clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} - static notrace cycle_t vread_hpet(void) { return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); } +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} + #ifdef CONFIG_PARAVIRT_CLOCK static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) @@ -133,24 +118,38 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static cycle_t vread_tsc(void) { - long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); - return ret; + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; } -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; -} - - notrace static inline u64 vgetsns(int *mode) { long v; From ce39c64028a075d14af32bfb8336bfe1370c0443 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:04 +0100 Subject: [PATCH 07/23] x86, vdso: __vdso_clock_gettime() cleanup This patch is a small code cleanup for the __vdso_clock_gettime() function. It removes the unneeded return values from do_monotonic_coarse() and do_realtime_coarse() and add a fallback label for doing the kernel gettimeofday() system call. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-5-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vclock_gettime.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index bbc80657050f..fd074dd19a4a 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -209,7 +209,7 @@ notrace static int do_monotonic(struct timespec *ts) return mode; } -notrace static int do_realtime_coarse(struct timespec *ts) +notrace static void do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { @@ -217,10 +217,9 @@ notrace static int do_realtime_coarse(struct timespec *ts) ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); - return 0; } -notrace static int do_monotonic_coarse(struct timespec *ts) +notrace static void do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { @@ -228,30 +227,32 @@ notrace static int do_monotonic_coarse(struct timespec *ts) ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); - - return 0; } notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - int ret = VCLOCK_NONE; - switch (clock) { case CLOCK_REALTIME: - ret = do_realtime(ts); + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; break; case CLOCK_MONOTONIC: - ret = do_monotonic(ts); + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; break; case CLOCK_REALTIME_COARSE: - return do_realtime_coarse(ts); + do_realtime_coarse(ts); + break; case CLOCK_MONOTONIC_COARSE: - return do_monotonic_coarse(ts); + do_monotonic_coarse(ts); + break; + default: + goto fallback; } - if (ret == VCLOCK_NONE) - return vdso_fallback_gettime(clock, ts); return 0; +fallback: + return vdso_fallback_gettime(clock, ts); } int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); From af8c93d8d9809c3cf71cae2c398069399e64efa3 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:05 +0100 Subject: [PATCH 08/23] x86, vdso: Replace VVAR(vsyscall_gtod_data) by gtod macro There a currently more than 30 users of the gtod macro, so replace the last VVAR(vsyscall_gtod_data) by gtod macro. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-6-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vclock_gettime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index fd074dd19a4a..743f27774b73 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -109,7 +109,7 @@ static notrace cycle_t vread_pvclock(int *mode) *mode = VCLOCK_NONE; /* refer to tsc.c read_tsc() comment for rationale */ - last = VVAR(vsyscall_gtod_data).clock.cycle_last; + last = gtod->clock.cycle_last; if (likely(ret >= last)) return ret; @@ -133,7 +133,7 @@ notrace static cycle_t vread_tsc(void) rdtsc_barrier(); ret = (cycle_t)vget_cycles(); - last = VVAR(vsyscall_gtod_data).clock.cycle_last; + last = gtod->clock.cycle_last; if (likely(ret >= last)) return ret; @@ -288,7 +288,7 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86_64 so we don't need any locks. */ - time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + time_t result = ACCESS_ONCE(gtod->wall_time_sec); if (t) *t = result; From 0df1ea2b7955d3cb311a549c44ed482452b859ff Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:06 +0100 Subject: [PATCH 09/23] x86, vdso: Cleanup __vdso_gettimeofday() This patch cleans up the __vdso_gettimeofday() function a little. It kicks out an unneeded ret local variable and makes the code faster if only the timezone is needed (an admittedly rare case.) Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-7-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vclock_gettime.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 743f27774b73..09dae4a1c6dc 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -259,13 +259,12 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { - long ret = VCLOCK_NONE; - if (likely(tv != NULL)) { BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != offsetof(struct timespec, tv_nsec) || sizeof(*tv) != sizeof(struct timespec)); - ret = do_realtime((struct timespec *)tv); + if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) + return vdso_fallback_gtod(tv, tz); tv->tv_usec /= 1000; } if (unlikely(tz != NULL)) { @@ -274,8 +273,6 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) tz->tz_dsttime = gtod->sys_tz.tz_dsttime; } - if (ret == VCLOCK_NONE) - return vdso_fallback_gtod(tv, tz); return 0; } int gettimeofday(struct timeval *, struct timezone *) From ef721987aef0cc0abba08c88810f2155f76b0b1f Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:07 +0100 Subject: [PATCH 10/23] x86, vdso: Introduce VVAR marco for vdso32 This patch revamps the vvar.h for introduce the VVAR macro for vdso32. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-8-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vvar.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 0a534eac5f56..52c79ffc3161 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -26,6 +26,15 @@ #else +#ifdef BUILD_VDSO32 + +#define DECLARE_VVAR(offset, type, name) \ + extern type vvar_ ## name __attribute__((visibility("hidden"))); + +#define VVAR(name) (vvar_ ## name) + +#else + extern char __vvar_page; /* Base address of vvars. This is not ABI. */ @@ -39,12 +48,13 @@ extern char __vvar_page; static type const * const vvaraddr_ ## name = \ (void *)(VVAR_ADDRESS + (offset)); +#define VVAR(name) (*vvaraddr_ ## name) +#endif + #define DEFINE_VVAR(type, name) \ type name \ __attribute__((section(".vvar_" #name), aligned(16))) __visible -#define VVAR(name) (*vvaraddr_ ## name) - #endif /* DECLARE_VVAR(offset, type, name) */ From b4b541a610c4db8643b36030ee5012203ca65778 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 17 Mar 2014 23:22:08 +0100 Subject: [PATCH 11/23] x86, vdso: Patch alternatives in the 32-bit VDSO We need the alternatives mechanism for rdtsc_barrier() to work. Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-9-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 2 ++ arch/x86/vdso/Makefile | 3 ++- arch/x86/vdso/vdso32-setup.c | 23 ++++++++++++----------- arch/x86/vdso/vma.c | 13 ++++++++++--- 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 5594e84d65d9..f8605e61b0c5 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -28,4 +28,6 @@ extern const char vdso32_int80_start, vdso32_int80_end; extern const char vdso32_syscall_start, vdso32_syscall_end; extern const char vdso32_sysenter_start, vdso32_sysenter_end; +void __init patch_vdso32(void *vdso, size_t len); + #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index fd14be1d1472..7a3d13e23f25 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -21,7 +21,8 @@ vobjs-$(VDSOX32-y) += $(vobjx32s-compat) vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y)) # files to link into kernel -obj-$(VDSO64-y) += vma.o vdso.o +obj-y += vma.o +obj-$(VDSO64-y) += vdso.o obj-$(VDSOX32-y) += vdsox32.o obj-$(VDSO32-y) += vdso32.o vdso32-setup.o diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index ab20c04b688a..e0fc767bcad3 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -112,24 +112,25 @@ void enable_sep_cpu(void) int __init sysenter_setup(void) { - void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - const void *vsyscall; - size_t vsyscall_len; + void *vdso_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vdso; + size_t vdso_len; - vdso32_pages[0] = virt_to_page(syscall_page); + vdso32_pages[0] = virt_to_page(vdso_page); if (vdso32_syscall()) { - vsyscall = &vdso32_syscall_start; - vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; + vdso = &vdso32_syscall_start; + vdso_len = &vdso32_syscall_end - &vdso32_syscall_start; } else if (vdso32_sysenter()){ - vsyscall = &vdso32_sysenter_start; - vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + vdso = &vdso32_sysenter_start; + vdso_len = &vdso32_sysenter_end - &vdso32_sysenter_start; } else { - vsyscall = &vdso32_int80_start; - vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; + vdso = &vdso32_int80_start; + vdso_len = &vdso32_int80_end - &vdso32_int80_start; } - memcpy(syscall_page, vsyscall, vsyscall_len); + memcpy(vdso_page, vdso, vdso_len); + patch_vdso32(vdso_page, vdso_len); return 0; } diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 431e87544411..7345bc9a1af6 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -16,6 +16,7 @@ #include #include +#if defined(CONFIG_X86_64) unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; @@ -28,8 +29,12 @@ static unsigned vdso_size; extern char vdsox32_start[], vdsox32_end[]; extern struct page *vdsox32_pages[]; static unsigned vdsox32_size; +#endif +#endif -static void __init patch_vdsox32(void *vdso, size_t len) +#if defined(CONFIG_X86_32) || defined(CONFIG_X86_X32_ABI) || \ + defined(CONFIG_COMPAT) +void __init patch_vdso32(void *vdso, size_t len) { Elf32_Ehdr *hdr = vdso; Elf32_Shdr *sechdrs, *alt_sec = 0; @@ -52,7 +57,7 @@ static void __init patch_vdsox32(void *vdso, size_t len) } /* If we get here, it's probably a bug. */ - pr_warning("patch_vdsox32: .altinstructions not found\n"); + pr_warning("patch_vdso32: .altinstructions not found\n"); return; /* nothing to patch */ found: @@ -61,6 +66,7 @@ found: } #endif +#if defined(CONFIG_X86_64) static void __init patch_vdso64(void *vdso, size_t len) { Elf64_Ehdr *hdr = vdso; @@ -104,7 +110,7 @@ static int __init init_vdso(void) vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); #ifdef CONFIG_X86_X32_ABI - patch_vdsox32(vdsox32_start, vdsox32_end - vdsox32_start); + patch_vdso32(vdsox32_start, vdsox32_end - vdsox32_start); npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE; vdsox32_size = npages << PAGE_SHIFT; for (i = 0; i < npages; i++) @@ -204,3 +210,4 @@ static __init int vdso_setup(char *s) return 0; } __setup("vdso=", vdso_setup); +#endif From 7a59ed415f5b57469e22e41fc4188d5399e0b194 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:09 +0100 Subject: [PATCH 12/23] x86, vdso: Add 32 bit VDSO time support for 32 bit kernel This patch add the time support for 32 bit a VDSO to a 32 bit kernel. For 32 bit programs running on a 32 bit kernel, the same mechanism is used as for 64 bit programs running on a 64 bit kernel. Reviewed-by: Andy Lutomirski Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-10-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 5 ++ arch/x86/include/asm/vdso32.h | 11 ++++ arch/x86/vdso/Makefile | 8 +++ arch/x86/vdso/vclock_gettime.c | 76 ++++++++++++++++++++++++--- arch/x86/vdso/vdso-layout.lds.S | 22 ++++++++ arch/x86/vdso/vdso32-setup.c | 47 +++++++++++++++-- arch/x86/vdso/vdso32/vclock_gettime.c | 9 ++++ arch/x86/vdso/vdso32/vdso32.lds.S | 11 ++++ 8 files changed, 178 insertions(+), 11 deletions(-) create mode 100644 arch/x86/include/asm/vdso32.h create mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index f8605e61b0c5..bde435998f3a 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -2,6 +2,11 @@ #define _ASM_X86_VDSO_H #if defined CONFIG_X86_32 || defined CONFIG_COMPAT + +#include + +extern const char VDSO32_PRELINK[]; + /* * Given a pointer to the vDSO image, find the pointer to VDSO32_name * as that symbol is defined in the vDSO sources or linker script. diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h new file mode 100644 index 000000000000..7efb7018406e --- /dev/null +++ b/arch/x86/include/asm/vdso32.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_VDSO32_H +#define _ASM_X86_VDSO32_H + +#define VDSO_BASE_PAGE 0 +#define VDSO_VVAR_PAGE 1 +#define VDSO_HPET_PAGE 2 +#define VDSO_PAGES 3 +#define VDSO_PREV_PAGES 2 +#define VDSO_OFFSET(x) ((x) * PAGE_SIZE) + +#endif diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 7a3d13e23f25..6cef7a1b014c 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -146,8 +146,16 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) $(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 09dae4a1c6dc..90bb5e8027ac 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -4,6 +4,9 @@ * * Fast user context implementation of clock_gettime, gettimeofday, and time. * + * 32 Bit compat layer by Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * * The code should have no internal unresolved relocations. * Check with readelf after changing. */ @@ -12,13 +15,11 @@ #define DISABLE_BRANCH_PROFILING #include -#include -#include +#include #include #include #include #include -#include #include #include #include @@ -26,6 +27,12 @@ #define gtod (&VVAR(vsyscall_gtod_data)) +extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); +extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern time_t __vdso_time(time_t *t); + +#ifndef BUILD_VDSO32 + static notrace cycle_t vread_hpet(void) { return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); @@ -118,6 +125,59 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif +#else + +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +#ifdef CONFIG_HPET_TIMER +static notrace cycle_t vread_hpet(void) +{ + return readl((const void __iomem *)(&hpet_page + HPET_COUNTER)); +} +#endif + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call VDSO32_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call VDSO32_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace cycle_t vread_pvclock(int *mode) +{ + *mode = VCLOCK_NONE; + return 0; +} +#endif + +#endif + notrace static cycle_t vread_tsc(void) { cycle_t ret; @@ -131,7 +191,7 @@ notrace static cycle_t vread_tsc(void) * but no one has ever seen it happen. */ rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); + ret = (cycle_t)__native_read_tsc(); last = gtod->clock.cycle_last; @@ -152,12 +212,14 @@ notrace static cycle_t vread_tsc(void) notrace static inline u64 vgetsns(int *mode) { - long v; + u64 v; cycles_t cycles; if (gtod->clock.vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); +#ifdef CONFIG_HPET_TIMER else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); +#endif #ifdef CONFIG_PARAVIRT_CLOCK else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); @@ -189,7 +251,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) return mode; } -notrace static int do_monotonic(struct timespec *ts) +notrace static int __always_inline do_monotonic(struct timespec *ts) { unsigned long seq; u64 ns; @@ -284,7 +346,7 @@ int gettimeofday(struct timeval *, struct timezone *) */ notrace time_t __vdso_time(time_t *t) { - /* This is atomic on x86_64 so we don't need any locks. */ + /* This is atomic on x86 so we don't need any locks. */ time_t result = ACCESS_ONCE(gtod->wall_time_sec); if (t) diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 8c550c1976f3..c6d0e1bf27a4 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -6,6 +6,24 @@ SECTIONS { +#ifdef BUILD_VDSO32 +#include + + .hpet_sect : { + hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); + } :text :hpet_sect + + .vvar_sect : { + vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + } :text :vvar_sect +#endif . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -61,4 +79,8 @@ PHDRS dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; +#ifdef BUILD_VDSO32 + vvar_sect PT_NULL FLAGS(4); /* PF_R */ + hpet_sect PT_NULL FLAGS(4); /* PF_R */ +#endif } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index e0fc767bcad3..e10abdf4cc10 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -25,6 +25,9 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_COMPAT_VDSO #define VDSO_DEFAULT 0 @@ -141,6 +144,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) struct mm_struct *mm = current->mm; unsigned long addr; int ret = 0; + struct vm_area_struct *vma; #ifdef CONFIG_X86_X32_ABI if (test_thread_flag(TIF_X32)) @@ -163,14 +167,49 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) /* * MAYWRITE to allow gdb to COW and set breakpoints */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso32_pages); + ret = install_special_mapping(mm, + addr, + VDSO_OFFSET(VDSO_PAGES - VDSO_PREV_PAGES), + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso32_pages); if (ret) goto up_fail; + vma = _install_special_mapping(mm, + addr - VDSO_OFFSET(VDSO_PREV_PAGES), + VDSO_OFFSET(VDSO_PREV_PAGES), + VM_READ, + NULL); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + ret = remap_pfn_range(vma, + addr - VDSO_OFFSET(VDSO_VVAR_PAGE), + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + +#ifdef CONFIG_HPET_TIMER + if (hpet_address) { + ret = io_remap_pfn_range(vma, + addr - VDSO_OFFSET(VDSO_HPET_PAGE), + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + pgprot_noncached(PAGE_READONLY)); + + if (ret) + goto up_fail; + } +#endif + current_thread_info()->sysenter_return = VDSO32_SYMBOL(addr, SYSENTER_RETURN); diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c new file mode 100644 index 000000000000..ca65e4213d63 --- /dev/null +++ b/arch/x86/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,9 @@ +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#undef CONFIG_X86_PPRO_FENCE + +#include "../vclock_gettime.c" diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index 90e7aa90dbe4..28c460703bbc 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -8,6 +8,11 @@ * values visible using the asm-x86/vdso.h macros from the kernel proper. */ +#include + +#define BUILD_VDSO32 +#define VDSO_PRELINK 0 + #include "../vdso-layout.lds.S" /* The ELF entry point can be used to set the AT_SYSINFO value. */ @@ -23,6 +28,9 @@ VERSION __kernel_vsyscall; __kernel_sigreturn; __kernel_rt_sigreturn; + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; local: *; }; } @@ -33,3 +41,6 @@ VERSION VDSO32_vsyscall = __kernel_vsyscall; VDSO32_sigreturn = __kernel_sigreturn; VDSO32_rt_sigreturn = __kernel_rt_sigreturn; +VDSO32_clock_gettime = clock_gettime; +VDSO32_gettimeofday = gettimeofday; +VDSO32_time = time; From 7c03156f34d113f885f045d8fb8cc3efd9e64751 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:10 +0100 Subject: [PATCH 13/23] x86, vdso: Add 32 bit VDSO time support for 64 bit kernel This patch add the VDSO time support for the IA32 Emulation Layer. Due the nature of the kernel headers and the LP64 compiler where the size of a long and a pointer differs against a 32 bit compiler, there is some type hacking necessary for optimal performance. The vsyscall_gtod_data struture must be a rearranged to serve 32- and 64-bit code access at the same time: - The seqcount_t was replaced by an unsigned, this makes the vsyscall_gtod_data intedepend of kernel configuration and internal functions. - All kernel internal structures are replaced by fix size elements which works for 32- and 64-bit access - The inner struct clock was removed to pack the whole struct. The "unsigned seq" would be handled by functions derivated from seqcount_t. Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-11-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vgtod.h | 71 ++++++++++++++++----- arch/x86/include/asm/vvar.h | 5 ++ arch/x86/kernel/vsyscall_gtod.c | 34 ++++++---- arch/x86/vdso/vclock_gettime.c | 91 +++++++++++++++------------ arch/x86/vdso/vdso32/vclock_gettime.c | 21 +++++++ 5 files changed, 155 insertions(+), 67 deletions(-) diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 46e24d36b7da..3c3366c2e37f 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -1,30 +1,73 @@ #ifndef _ASM_X86_VGTOD_H #define _ASM_X86_VGTOD_H -#include +#include #include +#ifdef BUILD_VDSO32_64 +typedef u64 gtod_long_t; +#else +typedef unsigned long gtod_long_t; +#endif +/* + * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time + * so be carefull by modifying this structure. + */ struct vsyscall_gtod_data { - seqcount_t seq; + unsigned seq; - struct { /* extract of a clocksource struct */ - int vclock_mode; - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; /* open coded 'struct timespec' */ - time_t wall_time_sec; u64 wall_time_snsec; + gtod_long_t wall_time_sec; + gtod_long_t monotonic_time_sec; u64 monotonic_time_snsec; - time_t monotonic_time_sec; + gtod_long_t wall_time_coarse_sec; + gtod_long_t wall_time_coarse_nsec; + gtod_long_t monotonic_time_coarse_sec; + gtod_long_t monotonic_time_coarse_nsec; - struct timezone sys_tz; - struct timespec wall_time_coarse; - struct timespec monotonic_time_coarse; + int tz_minuteswest; + int tz_dsttime; }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) +{ + unsigned ret; + +repeat: + ret = ACCESS_ONCE(s->seq); + if (unlikely(ret & 1)) { + cpu_relax(); + goto repeat; + } + smp_rmb(); + return ret; +} + +static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, + unsigned start) +{ + smp_rmb(); + return unlikely(s->seq != start); +} + +static inline void gtod_write_begin(struct vsyscall_gtod_data *s) +{ + ++s->seq; + smp_wmb(); +} + +static inline void gtod_write_end(struct vsyscall_gtod_data *s) +{ + smp_wmb(); + ++s->seq; +} + #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 52c79ffc3161..081d909bc495 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -16,6 +16,9 @@ * you mess up, the linker will catch it.) */ +#ifndef _ASM_X86_VVAR_H +#define _ASM_X86_VVAR_H + #if defined(__VVAR_KERNEL_LDS) /* The kernel linker script defines its own magic to put vvars in the @@ -64,3 +67,5 @@ DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR + +#endif diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index b5a943dba9f3..f9c6e56e14b5 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -4,6 +4,7 @@ * * Modified for x86 32 bit architecture by * Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany * * Thanks to hpa@transmeta.com for some useful hint. * Special thanks to Ingo Molnar for his early experience with @@ -13,26 +14,28 @@ #include #include +#include DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) { - vsyscall_gtod_data.sys_tz = sys_tz; + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; } void update_vsyscall(struct timekeeper *tk) { struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - write_seqcount_begin(&vdata->seq); + gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->clock->cycle_last; - vdata->clock.mask = tk->clock->mask; - vdata->clock.mult = tk->mult; - vdata->clock.shift = tk->shift; + vdata->vclock_mode = tk->clock->archdata.vclock_mode; + vdata->cycle_last = tk->clock->cycle_last; + vdata->mask = tk->clock->mask; + vdata->mult = tk->mult; + vdata->shift = tk->shift; vdata->wall_time_sec = tk->xtime_sec; vdata->wall_time_snsec = tk->xtime_nsec; @@ -49,11 +52,18 @@ void update_vsyscall(struct timekeeper *tk) vdata->monotonic_time_sec++; } - vdata->wall_time_coarse.tv_sec = tk->xtime_sec; - vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift); - vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse, - tk->wall_to_monotonic); + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - write_seqcount_end(&vdata->seq); + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 90bb5e8027ac..16d686171e9a 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -14,16 +14,14 @@ /* Disable profiling for userspace code: */ #define DISABLE_BRANCH_PROFILING -#include #include -#include -#include -#include #include #include +#include #include -#include -#include +#include +#include +#include #define gtod (&VVAR(vsyscall_gtod_data)) @@ -31,11 +29,23 @@ extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); +#ifdef CONFIG_HPET_TIMER +static inline u32 read_hpet_counter(const volatile void *addr) +{ + return *(const volatile u32 *) (addr + HPET_COUNTER); +} +#endif + #ifndef BUILD_VDSO32 +#include +#include +#include +#include + static notrace cycle_t vread_hpet(void) { - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); + return read_hpet_counter((const void *)fix_to_virt(VSYSCALL_HPET)); } notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) @@ -116,7 +126,7 @@ static notrace cycle_t vread_pvclock(int *mode) *mode = VCLOCK_NONE; /* refer to tsc.c read_tsc() comment for rationale */ - last = gtod->clock.cycle_last; + last = gtod->cycle_last; if (likely(ret >= last)) return ret; @@ -133,7 +143,7 @@ extern u8 hpet_page #ifdef CONFIG_HPET_TIMER static notrace cycle_t vread_hpet(void) { - return readl((const void __iomem *)(&hpet_page + HPET_COUNTER)); + return read_hpet_counter((const void *)(&hpet_page)); } #endif @@ -193,7 +203,7 @@ notrace static cycle_t vread_tsc(void) rdtsc_barrier(); ret = (cycle_t)__native_read_tsc(); - last = gtod->clock.cycle_last; + last = gtod->cycle_last; if (likely(ret >= last)) return ret; @@ -214,20 +224,21 @@ notrace static inline u64 vgetsns(int *mode) { u64 v; cycles_t cycles; - if (gtod->clock.vclock_mode == VCLOCK_TSC) + + if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); #ifdef CONFIG_HPET_TIMER - else if (gtod->clock.vclock_mode == VCLOCK_HPET) + else if (gtod->vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); #endif #ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); #endif else return 0; - v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; - return v * gtod->clock.mult; + v = (cycles - gtod->cycle_last) & gtod->mask; + return v * gtod->mult; } /* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ @@ -237,17 +248,18 @@ notrace static int __always_inline do_realtime(struct timespec *ts) u64 ns; int mode; - ts->tv_nsec = 0; do { - seq = raw_read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; ns += vgetsns(&mode); - ns >>= gtod->clock.shift; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; - timespec_add_ns(ts, ns); return mode; } @@ -257,16 +269,17 @@ notrace static int __always_inline do_monotonic(struct timespec *ts) u64 ns; int mode; - ts->tv_nsec = 0; do { - seq = raw_read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; ns += vgetsns(&mode); - ns >>= gtod->clock.shift; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); - timespec_add_ns(ts, ns); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; return mode; } @@ -275,20 +288,20 @@ notrace static void do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = raw_read_seqcount_begin(>od->seq); - ts->tv_sec = gtod->wall_time_coarse.tv_sec; - ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->wall_time_coarse_sec; + ts->tv_nsec = gtod->wall_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); } notrace static void do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { - seq = raw_read_seqcount_begin(>od->seq); - ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; - ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; - } while (unlikely(read_seqcount_retry(>od->seq, seq))); + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->monotonic_time_coarse_sec; + ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); } notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) @@ -322,17 +335,13 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { if (likely(tv != NULL)) { - BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != - offsetof(struct timespec, tv_nsec) || - sizeof(*tv) != sizeof(struct timespec)); if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) return vdso_fallback_gtod(tv, tz); tv->tv_usec /= 1000; } if (unlikely(tz != NULL)) { - /* Avoid memcpy. Some old compilers fail to inline it */ - tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; - tz->tz_dsttime = gtod->sys_tz.tz_dsttime; + tz->tz_minuteswest = gtod->tz_minuteswest; + tz->tz_dsttime = gtod->tz_dsttime; } return 0; diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c index ca65e4213d63..175cc72c0f68 100644 --- a/arch/x86/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/vdso/vdso32/vclock_gettime.c @@ -6,4 +6,25 @@ #undef CONFIG_X86_PPRO_FENCE +#ifdef CONFIG_X86_64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_X86_64 +#undef CONFIG_ILLEGAL_POINTER_VALUE +#undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_NR_CPUS + +#define CONFIG_X86_32 1 +#define CONFIG_PAGE_OFFSET 0 +#define CONFIG_ILLEGAL_POINTER_VALUE 0 +#define CONFIG_NR_CPUS 1 + +#define BUILD_VDSO32_64 + +#endif + #include "../vclock_gettime.c" From 309944be296efbb3ca4737d12ef49d2ba97cbecc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 17 Mar 2014 23:22:11 +0100 Subject: [PATCH 14/23] x86, vdso: Zero-pad the VVAR page By coincidence, the VVAR page is at the end of an ELF segment. As a result, if it ends up being a partial page, the kernel loader will leave garbage behind at the end of the vvar page. Zero-pad it to a full page to fix this issue. This has probably been broken since the VVAR page was introduced. On QEMU, if you dump the run-time contents of the VVAR page, you can find entertaining strings from seabios left behind. It's remotely possible that this is a security bug -- conceivably there's some BIOS out there that leaves something sensitive in the few K of memory that is exposed to userspace. Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-12-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1d4897baf1cc..49edf2dd3613 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -164,6 +164,11 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR + /* + * Pad the rest of the page with zeros. Otherwise the loader + * can leave garbage here. + */ + . = __vvar_beginning_hack + PAGE_SIZE; } :data . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); From 008cc907de327d83a0be609cd495fccb0e5dfa4c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 17 Mar 2014 23:22:12 +0100 Subject: [PATCH 15/23] x86, vdso32: Disable stack protector, adjust optimizations For the 32-bit VDSO, match the 64-bit VDSO in: 1. Disable the stack protector. 2. Use -fno-omit-frame-pointer for user space debugging sanity. 3. Use -foptimize-sibling-calls like the 64-bit VDSO does. Reported-by: Ingo Molnar Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-13-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 6cef7a1b014c..a2de5fc15511 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -151,6 +151,9 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ From 4e40112c4ff6a577dd06d92b2a54cdf06265bf74 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Mon, 17 Mar 2014 23:22:13 +0100 Subject: [PATCH 16/23] x86, vdso32: handle 32 bit vDSO larger one page This patch enables 32 bit vDSO which are larger than a page. Signed-off-by: Stefani Seibold Link: http://lkml.kernel.org/r/1395094933-14252-14-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso32-setup.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index e10abdf4cc10..5b4aaefb6b42 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -69,7 +70,8 @@ __setup_param("vdso=", vdso32_setup, vdso_setup, 0); EXPORT_SYMBOL_GPL(vdso_enabled); #endif -static struct page *vdso32_pages[1]; +static struct page **vdso32_pages; +static unsigned int vdso32_size; #ifdef CONFIG_X86_64 @@ -115,11 +117,10 @@ void enable_sep_cpu(void) int __init sysenter_setup(void) { - void *vdso_page = (void *)get_zeroed_page(GFP_ATOMIC); + void *vdso_pages; const void *vdso; size_t vdso_len; - - vdso32_pages[0] = virt_to_page(vdso_page); + unsigned int i; if (vdso32_syscall()) { vdso = &vdso32_syscall_start; @@ -132,8 +133,15 @@ int __init sysenter_setup(void) vdso_len = &vdso32_int80_end - &vdso32_int80_start; } - memcpy(vdso_page, vdso, vdso_len); - patch_vdso32(vdso_page, vdso_len); + vdso32_size = (vdso_len + PAGE_SIZE - 1) / PAGE_SIZE; + vdso32_pages = kmalloc(sizeof(*vdso32_pages) * vdso32_size, GFP_ATOMIC); + vdso_pages = kmalloc(VDSO_OFFSET(vdso32_size), GFP_ATOMIC); + + for(i = 0; i != vdso32_size; ++i) + vdso32_pages[i] = virt_to_page(vdso_pages + VDSO_OFFSET(i)); + + memcpy(vdso_pages, vdso, vdso_len); + patch_vdso32(vdso_pages, vdso_len); return 0; } @@ -169,7 +177,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) */ ret = install_special_mapping(mm, addr, - VDSO_OFFSET(VDSO_PAGES - VDSO_PREV_PAGES), + VDSO_OFFSET(vdso32_size), VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso32_pages); From b67e612cef1e5964efc6fa99fb7ad3d31c4db01a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 20 Mar 2014 15:01:21 -0700 Subject: [PATCH 17/23] x86: Load the 32-bit vdso in place, just like the 64-bit vdsos This replaces a decent amount of incomprehensible and buggy code with much more straightforward code. It also brings the 32-bit vdso more in line with the 64-bit vdsos, so maybe someday they can share even more code. This wastes a small amount of kernel .data and .text space, but it avoids a couple of allocations on startup, so it should be more or less a wash memory-wise. Signed-off-by: Andy Lutomirski Cc: Stefani Seibold Link: http://lkml.kernel.org/r/b8093933fad09ce181edb08a61dcd5d2592e9814.1395352498.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 8 ------ arch/x86/vdso/vdso.S | 22 ++-------------- arch/x86/vdso/vdso32-setup.c | 50 +++++++++++++++++++++--------------- arch/x86/vdso/vdso32.S | 21 +++------------ arch/x86/vdso/vdso_image.h | 30 ++++++++++++++++++++++ arch/x86/vdso/vdsox32.S | 22 ++-------------- arch/x86/vdso/vma.c | 8 +++--- 7 files changed, 70 insertions(+), 91 deletions(-) create mode 100644 arch/x86/vdso/vdso_image.h diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bde435998f3a..0301d78bb910 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -25,14 +25,6 @@ extern const char VDSO32_PRELINK[]; extern void __user __kernel_sigreturn; extern void __user __kernel_rt_sigreturn; -/* - * These symbols are defined by vdso32.S to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vdso32_int80_start, vdso32_int80_end; -extern const char vdso32_syscall_start, vdso32_syscall_end; -extern const char vdso32_sysenter_start, vdso32_sysenter_end; - void __init patch_vdso32(void *vdso, size_t len); #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1e13eb8c9656..c749d15da2e9 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,21 +1,3 @@ -#include -#include +#include "vdso_image.h" -__PAGE_ALIGNED_DATA - - .globl vdso_start, vdso_end - .align PAGE_SIZE -vdso_start: - .incbin "arch/x86/vdso/vdso.so" -vdso_end: - .align PAGE_SIZE /* extra data here leaks to userspace. */ - -.previous - - .globl vdso_pages - .bss - .align 8 - .type vdso_pages, @object -vdso_pages: - .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 - .size vdso_pages, .-vdso_pages +DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so") diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 5b4aaefb6b42..b45528ee8e19 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -29,6 +29,7 @@ #include #include #include +#include "vdso_image.h" #ifdef CONFIG_COMPAT_VDSO #define VDSO_DEFAULT 0 @@ -41,6 +42,12 @@ #define arch_setup_additional_pages syscall32_setup_pages #endif +DECLARE_VDSO_IMAGE(vdso32_int80); +#ifdef CONFIG_COMPAT +DECLARE_VDSO_IMAGE(vdso32_syscall); +#endif +DECLARE_VDSO_IMAGE(vdso32_sysenter); + /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? @@ -71,7 +78,7 @@ EXPORT_SYMBOL_GPL(vdso_enabled); #endif static struct page **vdso32_pages; -static unsigned int vdso32_size; +static unsigned vdso32_size; #ifdef CONFIG_X86_64 @@ -117,31 +124,32 @@ void enable_sep_cpu(void) int __init sysenter_setup(void) { - void *vdso_pages; - const void *vdso; - size_t vdso_len; - unsigned int i; + char *vdso32_start, *vdso32_end; + int npages, i; +#ifdef CONFIG_COMPAT if (vdso32_syscall()) { - vdso = &vdso32_syscall_start; - vdso_len = &vdso32_syscall_end - &vdso32_syscall_start; - } else if (vdso32_sysenter()){ - vdso = &vdso32_sysenter_start; - vdso_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + vdso32_start = vdso32_syscall_start; + vdso32_end = vdso32_syscall_end; + vdso32_pages = vdso32_syscall_pages; + } else +#endif + if (vdso32_sysenter()) { + vdso32_start = vdso32_sysenter_start; + vdso32_end = vdso32_sysenter_end; + vdso32_pages = vdso32_sysenter_pages; } else { - vdso = &vdso32_int80_start; - vdso_len = &vdso32_int80_end - &vdso32_int80_start; + vdso32_start = vdso32_int80_start; + vdso32_end = vdso32_int80_end; + vdso32_pages = vdso32_int80_pages; } - vdso32_size = (vdso_len + PAGE_SIZE - 1) / PAGE_SIZE; - vdso32_pages = kmalloc(sizeof(*vdso32_pages) * vdso32_size, GFP_ATOMIC); - vdso_pages = kmalloc(VDSO_OFFSET(vdso32_size), GFP_ATOMIC); + npages = ((vdso32_end - vdso32_start) + PAGE_SIZE - 1) / PAGE_SIZE; + vdso32_size = npages << PAGE_SHIFT; + for (i = 0; i < npages; i++) + vdso32_pages[i] = virt_to_page(vdso32_start + i*PAGE_SIZE); - for(i = 0; i != vdso32_size; ++i) - vdso32_pages[i] = virt_to_page(vdso_pages + VDSO_OFFSET(i)); - - memcpy(vdso_pages, vdso, vdso_len); - patch_vdso32(vdso_pages, vdso_len); + patch_vdso32(vdso32_start, vdso32_size); return 0; } @@ -177,7 +185,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) */ ret = install_special_mapping(mm, addr, - VDSO_OFFSET(vdso32_size), + vdso32_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso32_pages); diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S index 2ce5f82c333b..cfa6adda110b 100644 --- a/arch/x86/vdso/vdso32.S +++ b/arch/x86/vdso/vdso32.S @@ -1,22 +1,9 @@ -#include +#include "vdso_image.h" -__INITDATA +DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so") - .globl vdso32_int80_start, vdso32_int80_end -vdso32_int80_start: - .incbin "arch/x86/vdso/vdso32-int80.so" -vdso32_int80_end: - - .globl vdso32_syscall_start, vdso32_syscall_end -vdso32_syscall_start: #ifdef CONFIG_COMPAT - .incbin "arch/x86/vdso/vdso32-syscall.so" +DEFINE_VDSO_IMAGE(vdso32_syscall, "arch/x86/vdso/vdso32-syscall.so") #endif -vdso32_syscall_end: - .globl vdso32_sysenter_start, vdso32_sysenter_end -vdso32_sysenter_start: - .incbin "arch/x86/vdso/vdso32-sysenter.so" -vdso32_sysenter_end: - -__FINIT +DEFINE_VDSO_IMAGE(vdso32_sysenter, "arch/x86/vdso/vdso32-sysenter.so") diff --git a/arch/x86/vdso/vdso_image.h b/arch/x86/vdso/vdso_image.h new file mode 100644 index 000000000000..1baa6bc517bf --- /dev/null +++ b/arch/x86/vdso/vdso_image.h @@ -0,0 +1,30 @@ +#ifndef _VDSO_IMAGE_H +#define _VDSO_IMAGE_H + +#include +#include + +#define DEFINE_VDSO_IMAGE(symname, filename) \ +__PAGE_ALIGNED_DATA ; \ + .globl symname##_start, symname##_end ; \ + .align PAGE_SIZE ; \ + symname##_start: ; \ + .incbin filename ; \ + symname##_end: ; \ + .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \ + \ +.previous ; \ + \ + .globl symname##_pages ; \ + .bss ; \ + .align 8 ; \ + .type symname##_pages, @object ; \ + symname##_pages: ; \ + .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \ + .size symname##_pages, .-symname##_pages + +#define DECLARE_VDSO_IMAGE(symname) \ + extern char symname##_start[], symname##_end[]; \ + extern struct page *symname##_pages[] + +#endif /* _VDSO_IMAGE_H */ diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S index 295f1c7543d8..19a692778650 100644 --- a/arch/x86/vdso/vdsox32.S +++ b/arch/x86/vdso/vdsox32.S @@ -1,21 +1,3 @@ -#include -#include +#include "vdso_image.h" -__PAGE_ALIGNED_DATA - - .globl vdsox32_start, vdsox32_end - .align PAGE_SIZE -vdsox32_start: - .incbin "arch/x86/vdso/vdsox32.so" -vdsox32_end: - .align PAGE_SIZE /* extra data here leaks to userspace. */ - -.previous - - .globl vdsox32_pages - .bss - .align 8 - .type vdsox32_pages, @object -vdsox32_pages: - .zero (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 - .size vdsox32_pages, .-vdsox32_pages +DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so") diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7345bc9a1af6..6db0bbd876fd 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -15,19 +15,17 @@ #include #include #include +#include "vdso_image.h" #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso_enabled = 1; -extern char vdso_start[], vdso_end[]; +DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; - -extern struct page *vdso_pages[]; static unsigned vdso_size; #ifdef CONFIG_X86_X32_ABI -extern char vdsox32_start[], vdsox32_end[]; -extern struct page *vdsox32_pages[]; +DECLARE_VDSO_IMAGE(vdsox32); static unsigned vdsox32_size; #endif #endif From 9e6f450f946d35d585798da268d45c679632fe05 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 20 Mar 2014 18:57:18 -0700 Subject: [PATCH 18/23] x86, vdso: Move more vdso definitions into vdso.h This fixes the Xen build and gets rid of a silly header file. Signed-off-by: Andy Lutomirski Cc: Stefani Seibold Link: http://lkml.kernel.org/r/1df77311795aff75f5742c787d277518314a38d3.1395366931.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 38 ++++++++++++++++++++++++++++++++++++ arch/x86/vdso/vdso.S | 2 +- arch/x86/vdso/vdso32-setup.c | 7 ------- arch/x86/vdso/vdso32.S | 2 +- arch/x86/vdso/vdso_image.h | 30 ---------------------------- arch/x86/vdso/vdsox32.S | 2 +- arch/x86/vdso/vma.c | 1 - 7 files changed, 41 insertions(+), 41 deletions(-) delete mode 100644 arch/x86/vdso/vdso_image.h diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 0301d78bb910..7622a65a969e 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -1,10 +1,46 @@ #ifndef _ASM_X86_VDSO_H #define _ASM_X86_VDSO_H +#include +#include + +#ifdef __ASSEMBLER__ + +#define DEFINE_VDSO_IMAGE(symname, filename) \ +__PAGE_ALIGNED_DATA ; \ + .globl symname##_start, symname##_end ; \ + .align PAGE_SIZE ; \ + symname##_start: ; \ + .incbin filename ; \ + symname##_end: ; \ + .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \ + \ +.previous ; \ + \ + .globl symname##_pages ; \ + .bss ; \ + .align 8 ; \ + .type symname##_pages, @object ; \ + symname##_pages: ; \ + .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \ + .size symname##_pages, .-symname##_pages + +#else + +#define DECLARE_VDSO_IMAGE(symname) \ + extern char symname##_start[], symname##_end[]; \ + extern struct page *symname##_pages[] + #if defined CONFIG_X86_32 || defined CONFIG_COMPAT #include +DECLARE_VDSO_IMAGE(vdso32_int80); +#ifdef CONFIG_COMPAT +DECLARE_VDSO_IMAGE(vdso32_syscall); +#endif +DECLARE_VDSO_IMAGE(vdso32_sysenter); + extern const char VDSO32_PRELINK[]; /* @@ -27,4 +63,6 @@ extern void __user __kernel_rt_sigreturn; void __init patch_vdso32(void *vdso, size_t len); +#endif /* __ASSEMBLER__ */ + #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index c749d15da2e9..be3f23b09af5 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,3 +1,3 @@ -#include "vdso_image.h" +#include DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so") diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index b45528ee8e19..791c1cb822c6 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -29,7 +29,6 @@ #include #include #include -#include "vdso_image.h" #ifdef CONFIG_COMPAT_VDSO #define VDSO_DEFAULT 0 @@ -42,12 +41,6 @@ #define arch_setup_additional_pages syscall32_setup_pages #endif -DECLARE_VDSO_IMAGE(vdso32_int80); -#ifdef CONFIG_COMPAT -DECLARE_VDSO_IMAGE(vdso32_syscall); -#endif -DECLARE_VDSO_IMAGE(vdso32_sysenter); - /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S index cfa6adda110b..018bcd9f97b4 100644 --- a/arch/x86/vdso/vdso32.S +++ b/arch/x86/vdso/vdso32.S @@ -1,4 +1,4 @@ -#include "vdso_image.h" +#include DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so") diff --git a/arch/x86/vdso/vdso_image.h b/arch/x86/vdso/vdso_image.h deleted file mode 100644 index 1baa6bc517bf..000000000000 --- a/arch/x86/vdso/vdso_image.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _VDSO_IMAGE_H -#define _VDSO_IMAGE_H - -#include -#include - -#define DEFINE_VDSO_IMAGE(symname, filename) \ -__PAGE_ALIGNED_DATA ; \ - .globl symname##_start, symname##_end ; \ - .align PAGE_SIZE ; \ - symname##_start: ; \ - .incbin filename ; \ - symname##_end: ; \ - .align PAGE_SIZE /* extra data here leaks to userspace. */ ; \ - \ -.previous ; \ - \ - .globl symname##_pages ; \ - .bss ; \ - .align 8 ; \ - .type symname##_pages, @object ; \ - symname##_pages: ; \ - .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \ - .size symname##_pages, .-symname##_pages - -#define DECLARE_VDSO_IMAGE(symname) \ - extern char symname##_start[], symname##_end[]; \ - extern struct page *symname##_pages[] - -#endif /* _VDSO_IMAGE_H */ diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S index 19a692778650..f4aa34e7f370 100644 --- a/arch/x86/vdso/vdsox32.S +++ b/arch/x86/vdso/vdsox32.S @@ -1,3 +1,3 @@ -#include "vdso_image.h" +#include DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so") diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 6db0bbd876fd..1ad102613127 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -15,7 +15,6 @@ #include #include #include -#include "vdso_image.h" #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso_enabled = 1; From 3c1b63b9e4862fb16352a0646439c2dd6d9e0e5c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 20 Mar 2014 18:57:19 -0700 Subject: [PATCH 19/23] x86, vdso: Finish removing VDSO32_PRELINK It's a declaration of a nonexistent symbol. We can get rid of the 64-bit versions, too, but that's more intrusive. Signed-off-by: Andy Lutomirski Cc: Stefani Seibold Link: http://lkml.kernel.org/r/2ce2ce18447d8a0b78d44a278a066b6c0af06b32.1395366931.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 7622a65a969e..d1dc55404ff1 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -41,8 +41,6 @@ DECLARE_VDSO_IMAGE(vdso32_syscall); #endif DECLARE_VDSO_IMAGE(vdso32_sysenter); -extern const char VDSO32_PRELINK[]; - /* * Given a pointer to the vDSO image, find the pointer to VDSO32_name * as that symbol is defined in the vDSO sources or linker script. From 645a387ecbdb4aa78c8451a66416340616134537 Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Sun, 23 Mar 2014 17:38:14 +0100 Subject: [PATCH 20/23] x86, vdso: Fix size of get_unmapped_area() The size of the reserved memory for a 32 bit vdso must be the size of the 32 bit vDSO in pages + HPET page + VVAR page. One page is not enough for this. Grrrr.... silly copy and paste bug, was right in previous patch. Signed-off-by: Stefani Seibold Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/1395592694-20571-1-git-send-email-stefani@seibold.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso32-setup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 791c1cb822c6..00348980a3a6 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -165,12 +165,14 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + addr = get_unmapped_area(NULL, 0, vdso32_size + VDSO_OFFSET(VDSO_PREV_PAGES), 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } + addr += VDSO_OFFSET(VDSO_PREV_PAGES); + current->mm->context.vdso = (void *)addr; /* From 26f5ef2e3c3c18f1dc31461ddf1db00b014edcd4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 25 Mar 2014 13:41:36 -0700 Subject: [PATCH 21/23] x86, vdso: Actually discard the .discard sections The .discard/.discard.* sections are used to generate intermediate results for the assembler (effectively "test assembly".) The output is waste and should not be retained. Cc: Stefani Seibold Cc: Andy Lutomirski Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-psizrnant8x3nrhbgvq2vekr@git.kernel.org --- arch/x86/vdso/vdso-layout.lds.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index c6d0e1bf27a4..2e263f367b13 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -62,6 +62,11 @@ SECTIONS . = ALIGN(0x100); .text : { *(.text*) } :text =0x90909090 + + /DISCARD/ : { + *(.discard) + *(.discard.*) + } } /* From b9a4a56c1e5c72eac92d18aa48250a8c965632be Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 25 Mar 2014 16:25:53 -0700 Subject: [PATCH 22/23] x86, vdso, build: Don't rebuild 32-bit vdsos on every make vdso32/vclock_gettime.o was confusing kbuild. Signed-off-by: Andy Lutomirski Cc: Stefani Seibold Link: http://lkml.kernel.org/r/d741449340642213744dd659471a35bb970a0c4c.1395789923.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index a2de5fc15511..6e6361a01736 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -136,7 +136,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += $(vdso32-images) $(vdso32-images:=.dbg) -targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) extra-y += $(vdso32-images) From 37c975545ec63320789962bf307f000f08fabd48 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 29 Mar 2014 13:15:35 -0700 Subject: [PATCH 23/23] x86, vdso: Fix the symbol versions on the 32-bit vDSO The new symbols provide the same API as the 64-bit variants, so they should have the same symbol version name. This can't break userspace, since these symbols are new for 32-bit Linux. Signed-off-by: Andy Lutomirski Cc: Stefani Seibold Link: http://lkml.kernel.org/r/0a869bce03d25619565b1eee7d69a4fd15fd203a.1396124118.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso32/vdso32.lds.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index 28c460703bbc..aadb8b9994cd 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -23,14 +23,18 @@ ENTRY(__kernel_vsyscall); */ VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + }; + LINUX_2.5 { global: __kernel_vsyscall; __kernel_sigreturn; __kernel_rt_sigreturn; - __vdso_clock_gettime; - __vdso_gettimeofday; - __vdso_time; local: *; }; }