From 81b23ba645e6b2b446093b2d927c261a17f7dee3 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 14:08:07 +0800 Subject: [PATCH 001/312] csky: Fixup mb() synchronization problem The mb() is the superset of dma and smp. Using bar.xxx to implement mb() will cause problem when sync data with dma device, becasue bar.xxx couldn't guarantee bus transactions finished at outside bus level. We must use sync.s instead of bar.xxx for dma data synchronization and it will guarantee retirement after getting the bus bresponse. Changes for V2: - Use sync.s for all mb, rmb, wmb, dma_wmb, dma_rmb. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/include/asm/barrier.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h index 476eb786f22d..a430e7fddf35 100644 --- a/arch/csky/include/asm/barrier.h +++ b/arch/csky/include/asm/barrier.h @@ -9,11 +9,12 @@ #define nop() asm volatile ("nop\n":::"memory") /* - * sync: completion barrier - * sync.s: completion barrier and shareable to other cores - * sync.i: completion barrier with flush cpu pipeline - * sync.is: completion barrier with flush cpu pipeline and shareable to - * other cores + * sync: completion barrier, all sync.xx instructions + * guarantee the last response recieved by bus transaction + * made by ld/st instructions before sync.s + * sync.s: inherit from sync, but also shareable to other cores + * sync.i: inherit from sync, but also flush cpu pipeline + * sync.is: the same with sync.i + sync.s * * bar.brwarw: ordering barrier for all load/store instructions before it * bar.brwarws: ordering barrier for all load/store instructions before it @@ -27,9 +28,7 @@ */ #ifdef CONFIG_CPU_HAS_CACHEV2 -#define mb() asm volatile ("bar.brwarw\n":::"memory") -#define rmb() asm volatile ("bar.brar\n":::"memory") -#define wmb() asm volatile ("bar.bwaw\n":::"memory") +#define mb() asm volatile ("sync.s\n":::"memory") #ifdef CONFIG_SMP #define __smp_mb() asm volatile ("bar.brwarws\n":::"memory") From 7f80fe207de9602aaff028c79345caa68c90cd31 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 14:43:22 +0800 Subject: [PATCH 002/312] csky: Fixup dma_alloc_coherent with PAGE_SO attribute This bug is from commit: 2b070ccdf8c0 (fixup abiv2 mmap(... O_SYNC) failed). In that patch we remove the _PAGE_SO for memory noncache mapping and this will cause problem when drivers use dma descriptors to control the transcations without dma_w/rmb(). After referencing other archs' implementation, pgprot_writecombine is introduced for mmap(... O_SYNC). Signed-off-by: Guo Ren --- arch/csky/include/asm/pgtable.h | 10 ++++++++++ arch/csky/mm/ioremap.c | 6 ++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c429a6f347de..fc19ba446d62 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -258,6 +258,16 @@ static inline pgprot_t pgprot_noncached(pgprot_t _prot) { unsigned long prot = pgprot_val(_prot); + prot = (prot & ~_CACHE_MASK) | _CACHE_UNCACHED | _PAGE_SO; + + return __pgprot(prot); +} + +#define pgprot_writecombine pgprot_writecombine +static inline pgprot_t pgprot_writecombine(pgprot_t _prot) +{ + unsigned long prot = pgprot_val(_prot); + prot = (prot & ~_CACHE_MASK) | _CACHE_UNCACHED; return __pgprot(prot); diff --git a/arch/csky/mm/ioremap.c b/arch/csky/mm/ioremap.c index 8473b6bdf512..48531115fd9d 100644 --- a/arch/csky/mm/ioremap.c +++ b/arch/csky/mm/ioremap.c @@ -29,8 +29,7 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) vaddr = (unsigned long)area->addr; - prot = __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | - _PAGE_GLOBAL | _CACHE_UNCACHED | _PAGE_SO); + prot = pgprot_noncached(PAGE_KERNEL); if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { free_vm_area(area); @@ -51,10 +50,9 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { if (!pfn_valid(pfn)) { - vma_prot.pgprot |= _PAGE_SO; return pgprot_noncached(vma_prot); } else if (file->f_flags & O_SYNC) { - return pgprot_noncached(vma_prot); + return pgprot_writecombine(vma_prot); } return vma_prot; From 4af9027d3f4061992c0b065102a0a666b72f073b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 17:02:26 +0800 Subject: [PATCH 003/312] csky/dma: Fixup cache_op failed when cross memory ZONEs If the paddr and size are cross between NORMAL_ZONE and HIGHMEM_ZONE memory range, cache_op will panic in do_page_fault with bad_area. Optimize the code to support the range which cross memory ZONEs. Changes for V2: - Revert back to postcore_initcall Signed-off-by: Guo Ren Cc: Christoph Hellwig Cc: Arnd Bergmann --- arch/csky/mm/dma-mapping.c | 71 ++++++++++++++------------------------ 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 80783bb71c5c..65f531d54814 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -20,69 +20,50 @@ static int __init atomic_pool_init(void) } postcore_initcall(atomic_pool_init); -void arch_dma_prep_coherent(struct page *page, size_t size) -{ - if (PageHighMem(page)) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - do { - void *ptr = kmap_atomic(page); - size_t _size = (size < PAGE_SIZE) ? size : PAGE_SIZE; - - memset(ptr, 0, _size); - dma_wbinv_range((unsigned long)ptr, - (unsigned long)ptr + _size); - - kunmap_atomic(ptr); - - page++; - size -= PAGE_SIZE; - count--; - } while (count); - } else { - void *ptr = page_address(page); - - memset(ptr, 0, size); - dma_wbinv_range((unsigned long)ptr, (unsigned long)ptr + size); - } -} - static inline void cache_op(phys_addr_t paddr, size_t size, void (*fn)(unsigned long start, unsigned long end)) { - struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); - unsigned int offset = paddr & ~PAGE_MASK; - size_t left = size; - unsigned long start; + struct page *page = phys_to_page(paddr); + void *start = __va(page_to_phys(page)); + unsigned long offset = offset_in_page(paddr); + size_t left = size; do { size_t len = left; + if (offset + len > PAGE_SIZE) + len = PAGE_SIZE - offset; + if (PageHighMem(page)) { - void *addr; + start = kmap_atomic(page); - if (offset + len > PAGE_SIZE) { - if (offset >= PAGE_SIZE) { - page += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - } - len = PAGE_SIZE - offset; - } + fn((unsigned long)start + offset, + (unsigned long)start + offset + len); - addr = kmap_atomic(page); - start = (unsigned long)(addr + offset); - fn(start, start + len); - kunmap_atomic(addr); + kunmap_atomic(start); } else { - start = (unsigned long)phys_to_virt(paddr); - fn(start, start + size); + fn((unsigned long)start + offset, + (unsigned long)start + offset + len); } offset = 0; + page++; + start += PAGE_SIZE; left -= len; } while (left); } +static void dma_wbinv_set_zero_range(unsigned long start, unsigned long end) +{ + memset((void *)start, 0, end - start); + dma_wbinv_range(start, end); +} + +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + cache_op(page_to_phys(page), size, dma_wbinv_set_zero_range); +} + void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir) { From ae76f635d4e1cffa6870cc5472567ca9d6940a22 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 30 Jul 2019 17:16:28 +0800 Subject: [PATCH 004/312] csky: Optimize arch_sync_dma_for_cpu/device with dma_inv_range DMA_FROM_DEVICE only need to read dma data of memory into CPU cache, so there is no need to clear cache before. Also clear + inv for DMA_FROM_DEVICE won't cause problem, because the memory range for dma won't be touched by software during dma working. Changes for V2: - Remove clr cache and ignore the DMA_TO_DEVICE in _for_cpu. - Change inv to wbinv cache with DMA_FROM_DEVICE in _for_device. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/include/asm/cache.h | 1 + arch/csky/mm/cachev1.c | 7 ++++++- arch/csky/mm/cachev2.c | 11 ++++++++++- arch/csky/mm/dma-mapping.c | 5 ++--- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/csky/include/asm/cache.h b/arch/csky/include/asm/cache.h index d68373463676..1d5fc2f78fd7 100644 --- a/arch/csky/include/asm/cache.h +++ b/arch/csky/include/asm/cache.h @@ -24,6 +24,7 @@ void cache_wbinv_range(unsigned long start, unsigned long end); void cache_wbinv_all(void); void dma_wbinv_range(unsigned long start, unsigned long end); +void dma_inv_range(unsigned long start, unsigned long end); void dma_wb_range(unsigned long start, unsigned long end); #endif diff --git a/arch/csky/mm/cachev1.c b/arch/csky/mm/cachev1.c index b8a75cce0b8c..494ec912abff 100644 --- a/arch/csky/mm/cachev1.c +++ b/arch/csky/mm/cachev1.c @@ -120,7 +120,12 @@ void dma_wbinv_range(unsigned long start, unsigned long end) cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); } +void dma_inv_range(unsigned long start, unsigned long end) +{ + cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); +} + void dma_wb_range(unsigned long start, unsigned long end) { - cache_op_range(start, end, DATA_CACHE|CACHE_INV, 1); + cache_op_range(start, end, DATA_CACHE|CACHE_CLR|CACHE_INV, 1); } diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c index baaf05d69f44..b61be6518e21 100644 --- a/arch/csky/mm/cachev2.c +++ b/arch/csky/mm/cachev2.c @@ -69,11 +69,20 @@ void dma_wbinv_range(unsigned long start, unsigned long end) sync_is(); } +void dma_inv_range(unsigned long start, unsigned long end) +{ + unsigned long i = start & ~(L1_CACHE_BYTES - 1); + + for (; i < end; i += L1_CACHE_BYTES) + asm volatile("dcache.iva %0\n"::"r"(i):"memory"); + sync_is(); +} + void dma_wb_range(unsigned long start, unsigned long end) { unsigned long i = start & ~(L1_CACHE_BYTES - 1); for (; i < end; i += L1_CACHE_BYTES) - asm volatile("dcache.civa %0\n"::"r"(i):"memory"); + asm volatile("dcache.cva %0\n"::"r"(i):"memory"); sync_is(); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 65f531d54814..106ef02a8f89 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -85,11 +85,10 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, { switch (dir) { case DMA_TO_DEVICE: - cache_op(paddr, size, dma_wb_range); - break; + return; case DMA_FROM_DEVICE: case DMA_BIDIRECTIONAL: - cache_op(paddr, size, dma_wbinv_range); + cache_op(paddr, size, dma_inv_range); break; default: BUG(); From a92c7ba982e3950a82fd63b3fd289248bd99e8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Wed, 7 Aug 2019 19:22:34 -0400 Subject: [PATCH 005/312] fs/handle.c - fix up kerneldoc When building with W=1, we get some kerneldoc warnings: CC fs/fhandle.o fs/fhandle.c:259: warning: Function parameter or member 'flags' not described in 'sys_open_by_handle_at' fs/fhandle.c:259: warning: Excess function parameter 'flag' description in 'sys_open_by_handle_at' Fix the typo that caused it. Signed-off-by: Valdis Kletnieks Signed-off-by: Al Viro --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 0ee727485615..01263ffbc4c0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -246,7 +246,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened - * @flag: open flags. + * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative From 5336c17928cc464845ff765ce45b368c22f848e0 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Thu, 15 Aug 2019 16:24:56 +0800 Subject: [PATCH 006/312] csky: Fixup ioremap function losing Implement the following apis to meet usage in different scenarios. - ioremap (NonCache + StrongOrder) - ioremap_nocache (NonCache + StrongOrder) - ioremap_wc (NonCache + WeakOrder ) - ioremap_cache ( Cache + WeakOrder ) Also change flag VM_ALLOC to VM_IOREMAP in get_vm_area_caller. Signed-off-by: Guo Ren Cc: Arnd Bergmann Cc: Christoph Hellwig --- arch/csky/include/asm/io.h | 23 ++++++++++++----------- arch/csky/mm/ioremap.c | 23 +++++++++++++++++------ 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index c1dfa9c10e36..80d071e2567f 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -4,17 +4,10 @@ #ifndef __ASM_CSKY_IO_H #define __ASM_CSKY_IO_H -#include +#include #include #include -extern void __iomem *ioremap(phys_addr_t offset, size_t size); - -extern void iounmap(void *addr); - -extern int remap_area_pages(unsigned long address, phys_addr_t phys_addr, - size_t size, unsigned long flags); - /* * I/O memory access primitives. Reads are ordered relative to any * following Normal memory access. Writes are ordered relative to any prior @@ -40,9 +33,17 @@ extern int remap_area_pages(unsigned long address, phys_addr_t phys_addr, #define writel(v,c) ({ wmb(); writel_relaxed((v),(c)); mb(); }) #endif -#define ioremap_nocache(phy, sz) ioremap(phy, sz) -#define ioremap_wc ioremap_nocache -#define ioremap_wt ioremap_nocache +/* + * I/O memory mapping functions. + */ +extern void __iomem *ioremap_cache(phys_addr_t addr, size_t size); +extern void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot); +extern void iounmap(void *addr); + +#define ioremap(addr, size) __ioremap((addr), (size), pgprot_noncached(PAGE_KERNEL)) +#define ioremap_wc(addr, size) __ioremap((addr), (size), pgprot_writecombine(PAGE_KERNEL)) +#define ioremap_nocache(addr, size) ioremap((addr), (size)) +#define ioremap_cache ioremap_cache #include diff --git a/arch/csky/mm/ioremap.c b/arch/csky/mm/ioremap.c index 48531115fd9d..e13cd3497628 100644 --- a/arch/csky/mm/ioremap.c +++ b/arch/csky/mm/ioremap.c @@ -8,12 +8,12 @@ #include -void __iomem *ioremap(phys_addr_t addr, size_t size) +static void __iomem *__ioremap_caller(phys_addr_t addr, size_t size, + pgprot_t prot, void *caller) { phys_addr_t last_addr; unsigned long offset, vaddr; struct vm_struct *area; - pgprot_t prot; last_addr = addr + size - 1; if (!size || last_addr < addr) @@ -23,14 +23,12 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) addr &= PAGE_MASK; size = PAGE_ALIGN(size + offset); - area = get_vm_area_caller(size, VM_ALLOC, __builtin_return_address(0)); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; vaddr = (unsigned long)area->addr; - prot = pgprot_noncached(PAGE_KERNEL); - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { free_vm_area(area); return NULL; @@ -38,7 +36,20 @@ void __iomem *ioremap(phys_addr_t addr, size_t size) return (void __iomem *)(vaddr + offset); } -EXPORT_SYMBOL(ioremap); + +void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot) +{ + return __ioremap_caller(phys_addr, size, prot, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__ioremap); + +void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) +{ + return __ioremap_caller(phys_addr, size, PAGE_KERNEL, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_cache); void iounmap(void __iomem *addr) { From be819aa6f11145de32dab8690ec6055348488c18 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 15:31:29 +0800 Subject: [PATCH 007/312] csky: Fixup arch_get_unmapped_area() implementation Current arch_get_unmapped_area() of abiv1 doesn't use standard kernel api. After referring to the implementation of arch/arm, we implement it with vm_unmapped_area() from linux/mm.h. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/inc/abi/page.h | 5 ++- arch/csky/abiv1/mmap.c | 77 ++++++++++++++++++---------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/page.h b/arch/csky/abiv1/inc/abi/page.h index 6336e92a103a..c864519117c7 100644 --- a/arch/csky/abiv1/inc/abi/page.h +++ b/arch/csky/abiv1/inc/abi/page.h @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. -extern unsigned long shm_align_mask; +#include + extern void flush_dcache_page(struct page *page); static inline unsigned long pages_do_alias(unsigned long addr1, unsigned long addr2) { - return (addr1 ^ addr2) & shm_align_mask; + return (addr1 ^ addr2) & (SHMLBA-1); } static inline void clear_user_page(void *addr, unsigned long vaddr, diff --git a/arch/csky/abiv1/mmap.c b/arch/csky/abiv1/mmap.c index b462fd50b23a..6792aca49999 100644 --- a/arch/csky/abiv1/mmap.c +++ b/arch/csky/abiv1/mmap.c @@ -9,58 +9,63 @@ #include #include -unsigned long shm_align_mask = (0x4000 >> 1) - 1; /* Sane caches */ +#define COLOUR_ALIGN(addr,pgoff) \ + ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ + (((pgoff)<mm; + struct vm_area_struct *vma; + int do_align = 0; + struct vm_unmapped_area_info info; + /* + * We only need to do colour alignment if either the I or D + * caches alias. + */ + do_align = filp || (flags & MAP_SHARED); + + /* + * We enforce the MAP_FIXED case. + */ if (flags & MAP_FIXED) { - /* - * We do not accept a shared mapping if it would violate - * cache aliasing constraints. - */ - if ((flags & MAP_SHARED) && - ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) + if (flags & MAP_SHARED && + (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr; } if (len > TASK_SIZE) return -ENOMEM; - do_color_align = 0; - if (filp || (flags & MAP_SHARED)) - do_color_align = 1; + if (addr) { - if (do_color_align) + if (do_align) addr = COLOUR_ALIGN(addr, pgoff); else addr = PAGE_ALIGN(addr); - vmm = find_vma(current->mm, addr); - if (TASK_SIZE - len >= addr && - (!vmm || addr + len <= vmm->vm_start)) - return addr; - } - addr = TASK_UNMAPPED_BASE; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - else - addr = PAGE_ALIGN(addr); - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end). */ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vmm->vm_start) + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vm_start_gap(vma))) return addr; - addr = vmm->vm_end; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); } From dc140045c0cace809af872e3799e8fbe1b7d7f86 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 12:47:24 +0800 Subject: [PATCH 008/312] csky: Fixup defer cache flush for 610 We use defer cache flush mechanism to improve the performance of 610, but the implementation is wrong. We fix it up now and update the mechanism: - Zero page needn't be flushed. - If page is file mapping & non-touched in user space, defer flush. - If page is anon mapping or dirty file mapping, flush immediately. - In update_mmu_cache finish the defer flush by flush_dcache_page(). For 610 we need take care the dcache aliasing issue: - VIPT cache with 8K-bytes size per way in 4K page granularity. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/cacheflush.c | 50 +++++++++++++++------------- arch/csky/abiv1/inc/abi/cacheflush.h | 4 +-- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 10af8b6fe322..fee99fc6612f 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -11,42 +11,46 @@ #include #include +#define PG_dcache_clean PG_arch_1 + void flush_dcache_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - unsigned long addr; + struct address_space *mapping; - if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_arch_1, &(page)->flags); + if (page == ZERO_PAGE(0)) return; + + mapping = page_mapping_file(page); + + if (mapping && !page_mapcount(page)) + clear_bit(PG_dcache_clean, &page->flags); + else { + dcache_wbinv_all(); + if (mapping) + icache_inv_all(); + set_bit(PG_dcache_clean, &page->flags); } - - /* - * We could delay the flush for the !page_mapping case too. But that - * case is for exec env/arg pages and those are %99 certainly going to - * get faulted into the tlb (and thus flushed) anyways. - */ - addr = (unsigned long) page_address(page); - dcache_wb_range(addr, addr + PAGE_SIZE); } +EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) +void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep) { - unsigned long addr; + unsigned long pfn = pte_pfn(*ptep); struct page *page; - unsigned long pfn; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) + if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - addr = (unsigned long) page_address(page); + if (page == ZERO_PAGE(0)) + return; - if (vma->vm_flags & VM_EXEC || - pages_do_alias(addr, address & PAGE_MASK)) - cache_wbinv_all(); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + dcache_wbinv_all(); - clear_bit(PG_arch_1, &(page)->flags); + if (page_mapping_file(page)) { + if (vma->vm_flags & VM_EXEC) + icache_inv_all(); + } } diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 5f663aef9b1b..fce5604cef40 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -26,8 +26,8 @@ extern void flush_dcache_page(struct page *); #define flush_icache_page(vma, page) cache_wbinv_all() #define flush_icache_range(start, end) cache_wbinv_range(start, end) -#define flush_icache_user_range(vma, pg, adr, len) \ - cache_wbinv_range(adr, adr + len) +#define flush_icache_user_range(vma,page,addr,len) \ + flush_dcache_page(page) #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ From c7e6f0e99227b3dcdc5e62f789119e000887ff79 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 20 Aug 2019 20:15:44 +0800 Subject: [PATCH 009/312] csky: Support kernel non-aligned access We prohibit non-aligned access in kernel mode, but some special NIC driver needs to support kernel-state unaligned access. For example, when the bus does not support unaligned access, IP header parsing will cause non-aligned access and driver does not recopy the skb buffer to dma for performance reasons. Added kernel_enable & user_enable to control unaligned access and added kernel_count & user_count for statistical unaligned access. Signed-off-by: Guo Ren Cc: Arnd Bergmann --- arch/csky/abiv1/alignment.c | 62 +++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index 27ef5b2c43ab..cb2a0d94a144 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -5,8 +5,10 @@ #include #include -static int align_enable = 1; -static int align_count; +static int align_kern_enable = 1; +static int align_usr_enable = 1; +static int align_kern_count = 0; +static int align_usr_count = 0; static inline uint32_t get_ptreg(struct pt_regs *regs, uint32_t rx) { @@ -32,9 +34,6 @@ static int ldb_asm(uint32_t addr, uint32_t *valp) uint32_t val; int err; - if (!access_ok((void *)addr, 1)) - return 1; - asm volatile ( "movi %0, 0\n" "1:\n" @@ -67,9 +66,6 @@ static int stb_asm(uint32_t addr, uint32_t val) { int err; - if (!access_ok((void *)addr, 1)) - return 1; - asm volatile ( "movi %0, 0\n" "1:\n" @@ -203,8 +199,6 @@ static int stw_c(struct pt_regs *regs, uint32_t rz, uint32_t addr) if (stb_asm(addr, byte3)) return 1; - align_count++; - return 0; } @@ -226,7 +220,14 @@ void csky_alignment(struct pt_regs *regs) uint32_t addr = 0; if (!user_mode(regs)) + goto kernel_area; + + if (!align_usr_enable) { + pr_err("%s user disabled.\n", __func__); goto bad_area; + } + + align_usr_count++; ret = get_user(tmp, (uint16_t *)instruction_pointer(regs)); if (ret) { @@ -234,6 +235,19 @@ void csky_alignment(struct pt_regs *regs) goto bad_area; } + goto good_area; + +kernel_area: + if (!align_kern_enable) { + pr_err("%s kernel disabled.\n", __func__); + goto bad_area; + } + + align_kern_count++; + + tmp = *(uint16_t *)instruction_pointer(regs); + +good_area: opcode = (uint32_t)tmp; rx = opcode & 0xf; @@ -286,18 +300,32 @@ bad_area: force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); } -static struct ctl_table alignment_tbl[4] = { +static struct ctl_table alignment_tbl[5] = { { - .procname = "enable", - .data = &align_enable, - .maxlen = sizeof(align_enable), + .procname = "kernel_enable", + .data = &align_kern_enable, + .maxlen = sizeof(align_kern_enable), .mode = 0666, .proc_handler = &proc_dointvec }, { - .procname = "count", - .data = &align_count, - .maxlen = sizeof(align_count), + .procname = "user_enable", + .data = &align_usr_enable, + .maxlen = sizeof(align_usr_enable), + .mode = 0666, + .proc_handler = &proc_dointvec + }, + { + .procname = "kernel_count", + .data = &align_kern_count, + .maxlen = sizeof(align_kern_count), + .mode = 0666, + .proc_handler = &proc_dointvec + }, + { + .procname = "user_count", + .data = &align_usr_count, + .maxlen = sizeof(align_usr_count), .mode = 0666, .proc_handler = &proc_dointvec }, From 4ad35c1f56386c8e7019c921bba1af109fde9693 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 21 Aug 2019 19:15:52 +0800 Subject: [PATCH 010/312] csky: Fixup 610 vipt cache flush mechanism 610 has vipt aliasing issue, so we need to finish the cache flush apis mentioned in cachetlb.rst to avoid data corruption. Here is the list of modified apis in the patch: - flush_kernel_dcache_page (new add) - flush_dcache_mmap_lock (new add) - flush_dcache_mmap_unlock (new add) - flush_kernel_vmap_range (new add) - invalidate_kernel_vmap_range (new add) - flush_anon_page (new add) - flush_cache_range (new add) - flush_cache_vmap (flush all) - flush_cache_vunmap (flush all) - flush_cache_mm (only dcache flush) - flush_icache_page (just nop) - copy_from_user_page (remove no need flush) - copy_to_user_page (remove no need flush) Change to V2: - Fixup compile error with xa_lock*(&mapping->i_pages) Signed-off-by: Guo Ren Cc: Arnd Bergmann Cc: Christoph Hellwig --- arch/csky/abiv1/cacheflush.c | 20 ++++++++++++++ arch/csky/abiv1/inc/abi/cacheflush.h | 41 ++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index fee99fc6612f..9f1fe80cc847 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -54,3 +54,23 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, icache_inv_all(); } } + +void flush_kernel_dcache_page(struct page *page) +{ + struct address_space *mapping; + + mapping = page_mapping_file(page); + + if (!mapping || mapping_mapped(mapping)) + dcache_wbinv_all(); +} +EXPORT_SYMBOL(flush_kernel_dcache_page); + +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + dcache_wbinv_all(); + + if (vma->vm_flags & VM_EXEC) + icache_inv_all(); +} diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index fce5604cef40..79ef9e8c1afd 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -4,26 +4,49 @@ #ifndef __ABI_CSKY_CACHEFLUSH_H #define __ABI_CSKY_CACHEFLUSH_H -#include +#include #include #include #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); -#define flush_cache_mm(mm) cache_wbinv_all() +#define flush_cache_mm(mm) dcache_wbinv_all() #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() +#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +extern void flush_kernel_dcache_page(struct page *); + +#define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) +#define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) + +static inline void flush_kernel_vmap_range(void *addr, int size) +{ + dcache_wbinv_all(); +} +static inline void invalidate_kernel_vmap_range(void *addr, int size) +{ + dcache_wbinv_all(); +} + +#define ARCH_HAS_FLUSH_ANON_PAGE +static inline void flush_anon_page(struct vm_area_struct *vma, + struct page *page, unsigned long vmaddr) +{ + if (PageAnon(page)) + cache_wbinv_all(); +} + /* * if (current_mm != vma->mm) cache_wbinv_range(start, end) will be broken. * Use cache_wbinv_all() here and need to be improved in future. */ -#define flush_cache_range(vma, start, end) cache_wbinv_all() -#define flush_cache_vmap(start, end) cache_wbinv_range(start, end) -#define flush_cache_vunmap(start, end) cache_wbinv_range(start, end) +extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +#define flush_cache_vmap(start, end) cache_wbinv_all() +#define flush_cache_vunmap(start, end) cache_wbinv_all() -#define flush_icache_page(vma, page) cache_wbinv_all() +#define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) #define flush_icache_user_range(vma,page,addr,len) \ @@ -31,19 +54,13 @@ extern void flush_dcache_page(struct page *); #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ - cache_wbinv_all(); \ memcpy(dst, src, len); \ - cache_wbinv_all(); \ } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ - cache_wbinv_all(); \ memcpy(dst, src, len); \ cache_wbinv_all(); \ } while (0) -#define flush_dcache_mmap_lock(mapping) do {} while (0) -#define flush_dcache_mmap_unlock(mapping) do {} while (0) - #endif /* __ABI_CSKY_CACHEFLUSH_H */ From d098913a10f8ef8e6043765d7f2fa552527d9c42 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 5 Sep 2019 07:37:22 -0700 Subject: [PATCH 011/312] bus: ti-sysc: Fix clock handling for no-idle quirks NFSroot can fail on dra7 when cpsw is probed using ti-sysc interconnect target module driver as reported by Keerthy. Device clocks and the interconnect target module may or may not be enabled by the bootloader on init, but we currently assume the clocks and module are on from the bootloader for "ti,no-idle" and "ti,no-idle-on-init" quirks as reported by Grygorii Strashko. Let's fix the issue by always enabling clocks init, and never disable them for "ti,no-idle" quirk. For "ti,no-idle-on-init" quirk, we must decrement the usage count later on to allow PM runtime to idle the module if requested. Fixes: 1a5cd7c23cc5 ("bus: ti-sysc: Enable all clocks directly during init to read revision") Cc: Keerthy Cc: Vignesh Raghavendra Reported-by: Keerthy Reported-by: Grygorii Strashko Reviewed-by: Grygorii Strashko Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 48 +++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 2db474ab4c6b..da88de487792 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1630,17 +1630,19 @@ static int sysc_init_module(struct sysc *ddata) if (error) return error; - if (manage_clocks) { - sysc_clkdm_deny_idle(ddata); + sysc_clkdm_deny_idle(ddata); - error = sysc_enable_opt_clocks(ddata); - if (error) - return error; + /* + * Always enable clocks. The bootloader may or may not have enabled + * the related clocks. + */ + error = sysc_enable_opt_clocks(ddata); + if (error) + return error; - error = sysc_enable_main_clocks(ddata); - if (error) - goto err_opt_clocks; - } + error = sysc_enable_main_clocks(ddata); + if (error) + goto err_opt_clocks; if (!(ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT)) { error = sysc_rstctrl_reset_deassert(ddata, true); @@ -1658,7 +1660,7 @@ static int sysc_init_module(struct sysc *ddata) goto err_main_clocks; } - if (!ddata->legacy_mode && manage_clocks) { + if (!ddata->legacy_mode) { error = sysc_enable_module(ddata->dev); if (error) goto err_main_clocks; @@ -1675,6 +1677,7 @@ err_main_clocks: if (manage_clocks) sysc_disable_main_clocks(ddata); err_opt_clocks: + /* No re-enable of clockdomain autoidle to prevent module autoidle */ if (manage_clocks) { sysc_disable_opt_clocks(ddata); sysc_clkdm_allow_idle(ddata); @@ -2355,6 +2358,28 @@ static void ti_sysc_idle(struct work_struct *work) ddata = container_of(work, struct sysc, idle_work.work); + /* + * One time decrement of clock usage counts if left on from init. + * Note that we disable opt clocks unconditionally in this case + * as they are enabled unconditionally during init without + * considering sysc_opt_clks_needed() at that point. + */ + if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | + SYSC_QUIRK_NO_IDLE_ON_INIT)) { + sysc_clkdm_deny_idle(ddata); + sysc_disable_main_clocks(ddata); + sysc_disable_opt_clocks(ddata); + sysc_clkdm_allow_idle(ddata); + } + + /* Keep permanent PM runtime usage count for SYSC_QUIRK_NO_IDLE */ + if (ddata->cfg.quirks & SYSC_QUIRK_NO_IDLE) + return; + + /* + * Decrement PM runtime usage count for SYSC_QUIRK_NO_IDLE_ON_INIT + * and SYSC_QUIRK_NO_RESET_ON_INIT + */ if (pm_runtime_active(ddata->dev)) pm_runtime_put_sync(ddata->dev); } @@ -2439,7 +2464,8 @@ static int sysc_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&ddata->idle_work, ti_sysc_idle); /* At least earlycon won't survive without deferred idle */ - if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE_ON_INIT | + if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | + SYSC_QUIRK_NO_IDLE_ON_INIT | SYSC_QUIRK_NO_RESET_ON_INIT)) { schedule_delayed_work(&ddata->idle_work, 3000); } else { From 2783d0638a51e8dba6319d9f4a2b445995a4fad1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 5 Sep 2019 13:01:29 -0700 Subject: [PATCH 012/312] bus: ti-sysc: Fix handling of invalid clocks We can currently get "Unable to handle kernel paging request at virtual address" for invalid clocks with dts node but no driver: (__clk_get_hw) from [] (ti_sysc_find_one_clockdomain+0x18/0x34) (ti_sysc_find_one_clockdomain) from [] (ti_sysc_clkdm_init+0x34/0xdc) (ti_sysc_clkdm_init) from [] (sysc_probe+0xa50/0x10e8) (sysc_probe) from [] (platform_drv_probe+0x58/0xa8) Let's add IS_ERR checks to ti_sysc_clkdm_init() as And let's start treating clk_get() with -ENOENT as a proper error. If the clock name is specified in device tree we must succeed with clk_get() to continue. For modules with no clock names specified in device tree we will just ignore the clocks. Fixes: 2b2f7def058a ("bus: ti-sysc: Add support for missing clockdomain handling") Acked-by: Roger Quadros Tested-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/pdata-quirks.c | 4 ++-- drivers/bus/ti-sysc.c | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 6c6f8fce854e..d942a3357090 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -491,11 +491,11 @@ static int ti_sysc_clkdm_init(struct device *dev, struct clk *fck, struct clk *ick, struct ti_sysc_cookie *cookie) { - if (fck) + if (!IS_ERR(fck)) cookie->clkdm = ti_sysc_find_one_clockdomain(fck); if (cookie->clkdm) return 0; - if (ick) + if (!IS_ERR(ick)) cookie->clkdm = ti_sysc_find_one_clockdomain(ick); if (cookie->clkdm) return 0; diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index da88de487792..24583d82b584 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -280,9 +280,6 @@ static int sysc_get_one_clock(struct sysc *ddata, const char *name) ddata->clocks[index] = devm_clk_get(ddata->dev, name); if (IS_ERR(ddata->clocks[index])) { - if (PTR_ERR(ddata->clocks[index]) == -ENOENT) - return 0; - dev_err(ddata->dev, "clock get error for %s: %li\n", name, PTR_ERR(ddata->clocks[index])); @@ -357,7 +354,7 @@ static int sysc_get_clocks(struct sysc *ddata) continue; error = sysc_get_one_clock(ddata, name); - if (error && error != -ENOENT) + if (error) return error; } From 4957eccf979b025286b39388fd1a60cde601a10a Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:49 -0500 Subject: [PATCH 013/312] ARM: omap2plus_defconfig: Fix missing video When the panel-dpi driver was removed, the simple-panels driver was never enabled, so anyone who used the panel-dpi driver lost video, and those who used it inconjunction with simple-panels would have to manually enable CONFIG_DRM_PANEL_SIMPLE. This patch makes CONFIG_DRM_PANEL_SIMPLE a module in the same way the deprecated panel-dpi was. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index c7bf9c493646..64eb896907bf 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -363,6 +363,7 @@ CONFIG_DRM_OMAP_PANEL_TPO_TD028TTEC1=m CONFIG_DRM_OMAP_PANEL_TPO_TD043MTEA1=m CONFIG_DRM_OMAP_PANEL_NEC_NL8048HL11=m CONFIG_DRM_TILCDC=m +CONFIG_DRM_PANEL_SIMPLE=m CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_MODE_HELPERS=y From f9f5518a38684e031d913f40482721ff553f5ba2 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:50 -0500 Subject: [PATCH 014/312] ARM: dts: logicpd-torpedo-baseboard: Fix missing video A previous commit removed the panel-dpi driver, which made the Torpedo video stop working because it relied on the dpi driver for setting video timings. Now that the simple-panel driver is available in omap2plus, this patch migrates the Torpedo dev kits to use a similar panel and remove the manual timing requirements. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- .../boot/dts/logicpd-torpedo-baseboard.dtsi | 39 ++++--------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi index 642e809e757a..449cc7616da6 100644 --- a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi +++ b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi @@ -108,7 +108,6 @@ &dss { status = "ok"; vdds_dsi-supply = <&vpll2>; - vdda_video-supply = <&video_reg>; pinctrl-names = "default"; pinctrl-0 = <&dss_dpi_pins1>; port { @@ -124,44 +123,20 @@ display0 = &lcd0; }; - video_reg: video_reg { + lcd0: display { + /* This isn't the exact LCD, but the timings meet spec */ + /* To make it work, set CONFIG_OMAP2_DSS_MIN_FCK_PER_PCK=4 */ + compatible = "newhaven,nhd-4.3-480272ef-atxl"; + label = "15"; pinctrl-names = "default"; pinctrl-0 = <&panel_pwr_pins>; - compatible = "regulator-fixed"; - regulator-name = "fixed-supply"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - gpio = <&gpio5 27 GPIO_ACTIVE_HIGH>; /* gpio155, lcd INI */ - }; - - lcd0: display { - compatible = "panel-dpi"; - label = "15"; - status = "okay"; - /* default-on; */ - pinctrl-names = "default"; - + backlight = <&bl>; + enable-gpios = <&gpio5 27 GPIO_ACTIVE_HIGH>; port { lcd_in: endpoint { remote-endpoint = <&dpi_out>; }; }; - - panel-timing { - clock-frequency = <9000000>; - hactive = <480>; - vactive = <272>; - hfront-porch = <3>; - hback-porch = <2>; - hsync-len = <42>; - vback-porch = <3>; - vfront-porch = <4>; - vsync-len = <11>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; }; bl: backlight { From 24cf23276a54dd2825d3e3965c1b1b453e2a113d Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 28 Aug 2019 13:33:51 -0500 Subject: [PATCH 015/312] ARM: dts: am3517-evm: Fix missing video A previous commit removed the panel-dpi driver, which made the video on the AM3517-evm stop working because it relied on the dpi driver for setting video timings. Now that the simple-panel driver is available in omap2plus, this patch migrates the am3517-evm to use a similar panel and remove the manual timing requirements. Fixes: 8bf4b1621178 ("drm/omap: Remove panel-dpi driver") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am3517-evm.dts | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/arm/boot/dts/am3517-evm.dts b/arch/arm/boot/dts/am3517-evm.dts index ebfe28c2f544..a1fd3e63e86e 100644 --- a/arch/arm/boot/dts/am3517-evm.dts +++ b/arch/arm/boot/dts/am3517-evm.dts @@ -124,10 +124,11 @@ }; lcd0: display@0 { - compatible = "panel-dpi"; + /* This isn't the exact LCD, but the timings meet spec */ + /* To make it work, set CONFIG_OMAP2_DSS_MIN_FCK_PER_PCK=4 */ + compatible = "newhaven,nhd-4.3-480272ef-atxl"; label = "15"; - status = "okay"; - pinctrl-names = "default"; + backlight = <&bl>; enable-gpios = <&gpio6 16 GPIO_ACTIVE_HIGH>; /* gpio176, lcd INI */ vcc-supply = <&vdd_io_reg>; @@ -136,22 +137,6 @@ remote-endpoint = <&dpi_out>; }; }; - - panel-timing { - clock-frequency = <9000000>; - hactive = <480>; - vactive = <272>; - hfront-porch = <3>; - hback-porch = <2>; - hsync-len = <42>; - vback-porch = <3>; - vfront-porch = <4>; - vsync-len = <11>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; }; bl: backlight { From a932b77b4d1939ad173f18be87da409427fb705c Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Tue, 20 Aug 2019 07:17:27 -0500 Subject: [PATCH 016/312] ARM: dts: logicpd-som-lv: Fix i2c2 and i2c3 Pin mux When the pinmux configuration was added, it was accidentally placed into the omap3_pmx_wkup node when it should have been placed into the omap3_pmx_core. This error was accidentally propagated to stable by me when I blindly requested the pull after seeing I2C issues without actually reviewing the content of the pinout. Since the bootloader previously muxed these correctly in the past, was a hidden error. This patch moves the i2c2_pins and i2c3_pins to the correct node which should eliminate i2c bus errors and timeouts due to the fact the bootloader uses the save device tree that no longer properly assigns these pins. Fixes: 5fe3c0fa0d54 ("ARM: dts: Add pinmuxing for i2c2 and i2c3 for LogicPD SOM-LV") #4.9+ Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/logicpd-som-lv.dtsi | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi index 5563ee54c960..b56524cc7fe2 100644 --- a/arch/arm/boot/dts/logicpd-som-lv.dtsi +++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi @@ -228,6 +228,20 @@ >; }; + i2c2_pins: pinmux_i2c2_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21be, PIN_INPUT | MUX_MODE0) /* i2c2_scl */ + OMAP3_CORE1_IOPAD(0x21c0, PIN_INPUT | MUX_MODE0) /* i2c2_sda */ + >; + }; + + i2c3_pins: pinmux_i2c3_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21c2, PIN_INPUT | MUX_MODE0) /* i2c3_scl */ + OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0) /* i2c3_sda */ + >; + }; + tsc2004_pins: pinmux_tsc2004_pins { pinctrl-single,pins = < OMAP3_CORE1_IOPAD(0x2186, PIN_INPUT | MUX_MODE4) /* mcbsp4_dr.gpio_153 */ @@ -249,18 +263,6 @@ OMAP3_WKUP_IOPAD(0x2a0c, PIN_OUTPUT | MUX_MODE4) /* sys_boot1.gpio_3 */ >; }; - i2c2_pins: pinmux_i2c2_pins { - pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x21be, PIN_INPUT | MUX_MODE0) /* i2c2_scl */ - OMAP3_CORE1_IOPAD(0x21c0, PIN_INPUT | MUX_MODE0) /* i2c2_sda */ - >; - }; - i2c3_pins: pinmux_i2c3_pins { - pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x21c2, PIN_INPUT | MUX_MODE0) /* i2c3_scl */ - OMAP3_CORE1_IOPAD(0x21c4, PIN_INPUT | MUX_MODE0) /* i2c3_sda */ - >; - }; }; &omap3_pmx_core2 { From a4c8723a162e6244fb01944fbf446750575dba59 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 6 Sep 2019 12:57:46 -0700 Subject: [PATCH 017/312] bus: ti-sysc: Remove unpaired sysc_clkdm_deny_idle() Commit d098913a10f8 ("bus: ti-sysc: Fix clock handling for no-idle quirks") fixed handling for no-idle quirk modules that are not enabled by the bootloader. But it also caused unpaired clockdomain calls that won't allow idling the system. That's because clkdm_allow_idle_nolock() and clkdm_deny_idle_nolock() have usage count with clkdm->forcewake_count. Let's drop the unpaired sysc_clkdm_deny_idle() to fix idling of devices. Fixes: d098913a10f8 ("bus: ti-sysc: Fix clock handling for no-idle quirks") Cc: Keerthy Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 24583d82b584..364ee498feb3 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -2363,7 +2363,6 @@ static void ti_sysc_idle(struct work_struct *work) */ if (ddata->cfg.quirks & (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT)) { - sysc_clkdm_deny_idle(ddata); sysc_disable_main_clocks(ddata); sysc_disable_opt_clocks(ddata); sysc_clkdm_allow_idle(ddata); From b6749e20d5710c955fc6d4322f8fd98c915b2573 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 1 Sep 2019 22:12:35 +0100 Subject: [PATCH 018/312] arm64: KVM: Drop hyp_alternate_select for checking for ARM64_WORKAROUND_834220 There is no reason for using hyp_alternate_select when checking for ARM64_WORKAROUND_834220, as each of the capabilities is also backed by a static key. Just replace the KVM-specific construct with cpus_have_const_cap(ARM64_WORKAROUND_834220). Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Andrew Jones --- arch/arm64/kvm/hyp/switch.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index adaf266d8de8..a15baca9aca0 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -229,20 +229,6 @@ static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) } } -static bool __hyp_text __true_value(void) -{ - return true; -} - -static bool __hyp_text __false_value(void) -{ - return false; -} - -static hyp_alternate_select(__check_arm_834220, - __false_value, __true_value, - ARM64_WORKAROUND_834220); - static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) { u64 par, tmp; @@ -298,7 +284,8 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) * resolve the IPA using the AT instruction. */ if (!(esr & ESR_ELx_S1PTW) && - (__check_arm_834220()() || (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + (cpus_have_const_cap(ARM64_WORKAROUND_834220) || + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { From aa979fa899c5cf592e63ce7b34bc767224225c6a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 1 Sep 2019 22:12:36 +0100 Subject: [PATCH 019/312] arm64: KVM: Replace hyp_alternate_select with has_vhe() Given that the TLB invalidation path is pretty rarely used, there was never any advantage to using hyp_alternate_select() here. has_vhe(), being a glorified static key, is the right tool for the job. Off you go. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Andrew Jones --- arch/arm64/kvm/hyp/tlb.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index c466060b76d6..eb0efc5557f3 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -67,10 +67,14 @@ static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm, isb(); } -static hyp_alternate_select(__tlb_switch_to_guest, - __tlb_switch_to_guest_nvhe, - __tlb_switch_to_guest_vhe, - ARM64_HAS_VIRT_HOST_EXTN); +static void __hyp_text __tlb_switch_to_guest(struct kvm *kvm, + struct tlb_inv_context *cxt) +{ + if (has_vhe()) + __tlb_switch_to_guest_vhe(kvm, cxt); + else + __tlb_switch_to_guest_nvhe(kvm, cxt); +} static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm, struct tlb_inv_context *cxt) @@ -98,10 +102,14 @@ static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm, write_sysreg(0, vttbr_el2); } -static hyp_alternate_select(__tlb_switch_to_host, - __tlb_switch_to_host_nvhe, - __tlb_switch_to_host_vhe, - ARM64_HAS_VIRT_HOST_EXTN); +static void __hyp_text __tlb_switch_to_host(struct kvm *kvm, + struct tlb_inv_context *cxt) +{ + if (has_vhe()) + __tlb_switch_to_host_vhe(kvm, cxt); + else + __tlb_switch_to_host_nvhe(kvm, cxt); +} void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { @@ -111,7 +119,7 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - __tlb_switch_to_guest()(kvm, &cxt); + __tlb_switch_to_guest(kvm, &cxt); /* * We could do so much better if we had the VA as well. @@ -154,7 +162,7 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) if (!has_vhe() && icache_is_vpipt()) __flush_icache_all(); - __tlb_switch_to_host()(kvm, &cxt); + __tlb_switch_to_host(kvm, &cxt); } void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) @@ -165,13 +173,13 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - __tlb_switch_to_guest()(kvm, &cxt); + __tlb_switch_to_guest(kvm, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host()(kvm, &cxt); + __tlb_switch_to_host(kvm, &cxt); } void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) @@ -180,13 +188,13 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest()(kvm, &cxt); + __tlb_switch_to_guest(kvm, &cxt); __tlbi(vmalle1); dsb(nsh); isb(); - __tlb_switch_to_host()(kvm, &cxt); + __tlb_switch_to_host(kvm, &cxt); } void __hyp_text __kvm_flush_vm_context(void) From 084b5a80e87258997632dbd975ec3e5fda6bb943 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 1 Sep 2019 22:12:37 +0100 Subject: [PATCH 020/312] arm64: KVM: Kill hyp_alternate_select() hyp_alternate_select() is now completely unused. Goodbye. Signed-off-by: Marc Zyngier Reviewed-by: Christoffer Dall Reviewed-by: Andrew Jones --- arch/arm64/include/asm/kvm_hyp.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 86825aa20852..97f21cc66657 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -47,30 +47,6 @@ #define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1) #define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1) -/** - * hyp_alternate_select - Generates patchable code sequences that are - * used to switch between two implementations of a function, depending - * on the availability of a feature. - * - * @fname: a symbol name that will be defined as a function returning a - * function pointer whose type will match @orig and @alt - * @orig: A pointer to the default function, as returned by @fname when - * @cond doesn't hold - * @alt: A pointer to the alternate function, as returned by @fname - * when @cond holds - * @cond: a CPU feature (as described in asm/cpufeature.h) - */ -#define hyp_alternate_select(fname, orig, alt, cond) \ -typeof(orig) * __hyp_text fname(void) \ -{ \ - typeof(alt) *val = orig; \ - asm volatile(ALTERNATIVE("nop \n", \ - "mov %0, %1 \n", \ - cond) \ - : "+r" (val) : "r" (alt)); \ - return val; \ -} - int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); From 38c7a30a9d5ff8dd0c7a9ab1441aa5a96cf60afa Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 10 Sep 2019 10:26:46 -0700 Subject: [PATCH 021/312] Documentation/process: Volunteer as the ambassador for Intel Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Sasha Levin Cc: Ben Hutchings Cc: Thomas Gleixner Cc: Laura Abbott Cc: Andrew Cooper Cc: Trilok Soni Cc: Kees Cook Cc: Tony Luck Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Dan Williams Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/r/20190910172646.25BFCE7B@viggo.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 402636356fbe..e57b9f39c69f 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -216,7 +216,7 @@ an involved disclosed party. The current ambassadors list: ARM AMD IBM - Intel + Intel Tony Luck Qualcomm Trilok Soni Microsoft Sasha Levin From aac60f1a867773de9eb164013d89c99f3ea1f009 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 11 Sep 2019 02:33:35 +0000 Subject: [PATCH 022/312] KVM: arm/arm64: vgic: Use the appropriate TRACE_INCLUDE_PATH Commit 49dfe94fe5ad ("KVM: arm/arm64: Fix TRACE_INCLUDE_PATH") fixes TRACE_INCLUDE_PATH to the correct relative path to the define_trace.h and explains why did the old one work. The same fix should be applied to virt/kvm/arm/vgic/trace.h. Reviewed-by: Masahiro Yamada Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index 55fed77a9f73..4fd4f6db181b 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -30,7 +30,7 @@ TRACE_EVENT(vgic_update_irq_pending, #endif /* _TRACE_VGIC_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm/vgic +#define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace From 473ef57ad8edc25efd083a583a5f6604b47d3822 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Sep 2019 12:19:48 -0400 Subject: [PATCH 023/312] afs dynroot: switch to simple_dir_operations no point reinventing it (with wrong ->read(), BTW). Signed-off-by: Al Viro --- fs/afs/dynroot.c | 7 ------- fs/afs/inode.c | 2 +- fs/afs/internal.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index bcd1bafb0278..4150280509ff 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,13 +10,6 @@ #include #include "internal.h" -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - /* * Probe to see if a cell may exist. This prevents positive dentries from * being created unnecessarily. diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 7b1c18c32f48..46d2d7cb461d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -443,7 +443,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &afs_dynroot_file_operations; + inode->i_fop = &simple_dir_operations; } else { inode->i_op = &afs_autocell_inode_operations; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f66a3be12fd6..9b2a2a21e3dc 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -910,7 +910,6 @@ extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ -extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; From 11ed5cf064beac3ca345bb75fdf2429e03ac106a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Sep 2019 16:21:30 +0100 Subject: [PATCH 024/312] firmware: arm_scmi: reset: fix reset_state assignment in scmi_domain_reset Fix the copy paste typo that incorrectly assigns domain_id with the passed 'state' parameter instead of reset_state. Fixes: 95a15d80aa0d ("firmware: arm_scmi: Add RESET protocol in SCMI v2.0") Reported-by: Etienne Carriere Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/reset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c index 64cc81915581..ab42c21c5517 100644 --- a/drivers/firmware/arm_scmi/reset.c +++ b/drivers/firmware/arm_scmi/reset.c @@ -150,7 +150,7 @@ static int scmi_domain_reset(const struct scmi_handle *handle, u32 domain, dom = t->tx.buf; dom->domain_id = cpu_to_le32(domain); dom->flags = cpu_to_le32(flags); - dom->domain_id = cpu_to_le32(state); + dom->reset_state = cpu_to_le32(state); if (rdom->async_reset) ret = scmi_do_xfer_with_response(handle, t); From 61423712dbb86e02af4aa5de65b9041493c92cac Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Sep 2019 16:21:07 +0100 Subject: [PATCH 025/312] reset: reset-scmi: add missing handle initialisation scmi_reset_data->handle needs to be initialised at probe, so that it can be later used to access scmi reset protocol APIs using the same. Since it was tested with a module that obtained handle elsewhere, it was missed easily. Add the missing scmi_reset_data->handle initialisation to fix the issue. Fixes: c8ae9c2da1cc ("reset: Add support for resets provided by SCMI") Acked-by: Philipp Zabel Reported-by: Etienne Carriere Signed-off-by: Sudeep Holla --- drivers/reset/reset-scmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/reset/reset-scmi.c b/drivers/reset/reset-scmi.c index c6d3c8427f14..b46df80ec6c3 100644 --- a/drivers/reset/reset-scmi.c +++ b/drivers/reset/reset-scmi.c @@ -102,6 +102,7 @@ static int scmi_reset_probe(struct scmi_device *sdev) data->rcdev.owner = THIS_MODULE; data->rcdev.of_node = np; data->rcdev.nr_resets = handle->reset_ops->num_domains_get(handle); + data->handle = handle; return devm_reset_controller_register(dev, &data->rcdev); } From 7fd25e6fc035f4b04b75bca6d7e8daa069603a76 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 19 Sep 2019 14:12:34 +0200 Subject: [PATCH 026/312] ieee802154: atusb: fix use-after-free at disconnect The disconnect callback was accessing the hardware-descriptor private data after having having freed it. Fixes: 7490b008d123 ("ieee802154: add support for atusb transceiver") Cc: stable # 4.2 Cc: Alexander Aring Reported-by: syzbot+f4509a9138a1472e7e80@syzkaller.appspotmail.com Signed-off-by: Johan Hovold Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/atusb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/atusb.c b/drivers/net/ieee802154/atusb.c index ceddb424f887..0dd0ba915ab9 100644 --- a/drivers/net/ieee802154/atusb.c +++ b/drivers/net/ieee802154/atusb.c @@ -1137,10 +1137,11 @@ static void atusb_disconnect(struct usb_interface *interface) ieee802154_unregister_hw(atusb->hw); + usb_put_dev(atusb->usb_dev); + ieee802154_free_hw(atusb->hw); usb_set_intfdata(interface, NULL); - usb_put_dev(atusb->usb_dev); pr_debug("%s done\n", __func__); } From b74d957f6317e60924d0d3c0c01750814c24611b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 29 Aug 2019 15:43:49 +0200 Subject: [PATCH 027/312] ARM: aspeed: ast2500 is ARMv6K Linux supports both the original ARMv6 level (early ARM1136) and ARMv6K (later ARM1136, ARM1176 and ARM11mpcore). ast2500 falls into the second categoy, being based on arm1176jzf-s. This is enabled by default when using ARCH_MULTI_V6, so we should not 'select CPU_V6'. Removing this will lead to more efficient use of atomic instructions. Fixes: 8c2ed9bcfbeb ("arm: Add Aspeed machine") Signed-off-by: Arnd Bergmann Reviewed-by: Joel Stanley --- arch/arm/mach-aspeed/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/mach-aspeed/Kconfig b/arch/arm/mach-aspeed/Kconfig index a15c3a291386..44ee6bac255e 100644 --- a/arch/arm/mach-aspeed/Kconfig +++ b/arch/arm/mach-aspeed/Kconfig @@ -26,7 +26,6 @@ config MACH_ASPEED_G4 config MACH_ASPEED_G5 bool "Aspeed SoC 5th Generation" depends on ARCH_MULTI_V6 - select CPU_V6 select PINCTRL_ASPEED_G5 help Say yes if you intend to run on an Aspeed ast2500 or similar From 7e7db86c7e1088e768438fe6c894d748b0c32abe Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 19 Sep 2019 04:00:55 -0500 Subject: [PATCH 028/312] smb3: allow decryption keys to be dumped by admin for debugging In order to debug certain problems it is important to be able to decrypt network traces (e.g. wireshark) but to do this we need to be able to dump out the encryption/decryption keys. Dumping them to an ioctl is safer than dumping then to dmesg, (and better than showing all keys in a pseudofile). Restrict this to root (CAP_SYS_ADMIN), and only for a mount that this admin has access to. Sample smbinfo output: SMB3.0 encryption Session Id: 0x82d2ec52 Session Key: a5 6d 81 d0 e c1 ca e1 d8 13 aa 20 e8 f2 cc 71 Server Encryption Key: 1a c3 be ba 3d fc dc 3c e bc 93 9e 50 9e 19 c1 Server Decryption Key: e0 d4 d9 43 1b a2 1b e3 d8 76 77 49 56 f7 20 88 Reviewed-by: Aurelien Aptel Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 9 +++++++++ fs/cifs/ioctl.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 6c3bd07868d7..0f0dc1c1fe41 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,9 +57,18 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +struct smb3_key_debug_info { + __u64 Suid; + __u16 cipher_type; + __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) +#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 76ddd98b6298..1a01e108d75e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -164,6 +164,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); + struct smb3_key_debug_info pkey_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -270,6 +271,34 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_DUMP_KEY: + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + + tcon = tlink_tcon(pSMBFile->tlink); + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + break; + } + pkey_inf.cipher_type = + le16_to_cpu(tcon->ses->server->cipher_type); + pkey_inf.Suid = tcon->ses->Suid; + memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pkey_inf.smb3decryptionkey, + tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(pkey_inf.smb3encryptionkey, + tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE); + if (copy_to_user((void __user *)arg, &pkey_inf, + sizeof(struct smb3_key_debug_info))) + rc = -EFAULT; + else + rc = 0; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; From dd89d82e751473d13da0daea1bfb15d5634071c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 15 May 2019 12:34:21 +0300 Subject: [PATCH 029/312] thermal: thermal_mmio: remove some dead code The platform_get_resource() function doesn't return error pointers, it returns NULL. The way this is normally done, is that we pass the NULL resource to devm_ioremap_resource() and then check for errors from that. See the comment in front of devm_ioremap_resource() for more details. Signed-off-by: Dan Carpenter Acked-by: Talel Shenhar Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_mmio.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c index de3cceea23bc..40524fa13533 100644 --- a/drivers/thermal/thermal_mmio.c +++ b/drivers/thermal/thermal_mmio.c @@ -53,13 +53,6 @@ static int thermal_mmio_probe(struct platform_device *pdev) return -ENOMEM; resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (IS_ERR(resource)) { - dev_err(&pdev->dev, - "fail to get platform memory resource (%ld)\n", - PTR_ERR(resource)); - return PTR_ERR(resource); - } - sensor->mmio_base = devm_ioremap_resource(&pdev->dev, resource); if (IS_ERR(sensor->mmio_base)) { dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n", From 8581d51055a08cc6eb061c8856062290e8582ce4 Mon Sep 17 00:00:00 2001 From: "Lowry Li (Arm Technology China)" Date: Wed, 31 Jul 2019 11:04:38 +0000 Subject: [PATCH 030/312] drm: Free the writeback_job when it with an empty fb Adds the check if the writeback_job with an empty fb, then it should be freed in atomic_check phase. With this change, the driver users will not check empty fb case any more. So refined accordingly. Signed-off-by: Lowry Li (Arm Technology China) Reviewed-by: Liviu Dudau Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/1564571048-15029-2-git-send-email-lowry.li@arm.com --- .../drm/arm/display/komeda/komeda_wb_connector.c | 3 +-- drivers/gpu/drm/arm/malidp_mw.c | 4 ++-- drivers/gpu/drm/drm_atomic.c | 13 +++++++++---- drivers/gpu/drm/rcar-du/rcar_du_writeback.c | 4 ++-- drivers/gpu/drm/vc4/vc4_txp.c | 5 ++--- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c index 2851cac94d86..23fbee268119 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c @@ -43,9 +43,8 @@ komeda_wb_encoder_atomic_check(struct drm_encoder *encoder, struct komeda_data_flow_cfg dflow; int err; - if (!writeback_job || !writeback_job->fb) { + if (!writeback_job) return 0; - } if (!crtc_st->active) { DRM_DEBUG_ATOMIC("Cannot write the composition result out on a inactive CRTC.\n"); diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c index 2e812525025d..a59227b2cdb5 100644 --- a/drivers/gpu/drm/arm/malidp_mw.c +++ b/drivers/gpu/drm/arm/malidp_mw.c @@ -130,7 +130,7 @@ malidp_mw_encoder_atomic_check(struct drm_encoder *encoder, struct drm_framebuffer *fb; int i, n_planes; - if (!conn_state->writeback_job || !conn_state->writeback_job->fb) + if (!conn_state->writeback_job) return 0; fb = conn_state->writeback_job->fb; @@ -247,7 +247,7 @@ void malidp_mw_atomic_commit(struct drm_device *drm, mw_state = to_mw_state(conn_state); - if (conn_state->writeback_job && conn_state->writeback_job->fb) { + if (conn_state->writeback_job) { struct drm_framebuffer *fb = conn_state->writeback_job->fb; DRM_DEV_DEBUG_DRIVER(drm->dev, diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 419381abbdd1..14aeaf736321 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -430,10 +430,15 @@ static int drm_atomic_connector_check(struct drm_connector *connector, return -EINVAL; } - if (writeback_job->out_fence && !writeback_job->fb) { - DRM_DEBUG_ATOMIC("[CONNECTOR:%d:%s] requesting out-fence without framebuffer\n", - connector->base.id, connector->name); - return -EINVAL; + if (!writeback_job->fb) { + if (writeback_job->out_fence) { + DRM_DEBUG_ATOMIC("[CONNECTOR:%d:%s] requesting out-fence without framebuffer\n", + connector->base.id, connector->name); + return -EINVAL; + } + + drm_writeback_cleanup_job(writeback_job); + state->writeback_job = NULL; } return 0; diff --git a/drivers/gpu/drm/rcar-du/rcar_du_writeback.c b/drivers/gpu/drm/rcar-du/rcar_du_writeback.c index ae07290bba6a..04efa78d70b6 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_writeback.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_writeback.c @@ -147,7 +147,7 @@ static int rcar_du_wb_enc_atomic_check(struct drm_encoder *encoder, struct drm_device *dev = encoder->dev; struct drm_framebuffer *fb; - if (!conn_state->writeback_job || !conn_state->writeback_job->fb) + if (!conn_state->writeback_job) return 0; fb = conn_state->writeback_job->fb; @@ -221,7 +221,7 @@ void rcar_du_writeback_setup(struct rcar_du_crtc *rcrtc, unsigned int i; state = rcrtc->writeback.base.state; - if (!state || !state->writeback_job || !state->writeback_job->fb) + if (!state || !state->writeback_job) return; fb = state->writeback_job->fb; diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c index 96f91c1b4b6e..e92fa1275034 100644 --- a/drivers/gpu/drm/vc4/vc4_txp.c +++ b/drivers/gpu/drm/vc4/vc4_txp.c @@ -229,7 +229,7 @@ static int vc4_txp_connector_atomic_check(struct drm_connector *conn, int i; conn_state = drm_atomic_get_new_connector_state(state, conn); - if (!conn_state->writeback_job || !conn_state->writeback_job->fb) + if (!conn_state->writeback_job) return 0; crtc_state = drm_atomic_get_new_crtc_state(state, conn_state->crtc); @@ -269,8 +269,7 @@ static void vc4_txp_connector_atomic_commit(struct drm_connector *conn, u32 ctrl; int i; - if (WARN_ON(!conn_state->writeback_job || - !conn_state->writeback_job->fb)) + if (WARN_ON(!conn_state->writeback_job)) return; mode = &conn_state->crtc->state->adjusted_mode; From b1066a123538044117f0a78ba8c6a50cf5a04c86 Mon Sep 17 00:00:00 2001 From: "Lowry Li (Arm Technology China)" Date: Wed, 31 Jul 2019 11:04:45 +0000 Subject: [PATCH 031/312] drm: Clear the fence pointer when writeback job signaled During it signals the completion of a writeback job, after releasing the out_fence, we'd clear the pointer. Check if fence left over in drm_writeback_cleanup_job(), release it. Signed-off-by: Lowry Li (Arm Technology China) Reviewed-by: Brian Starkey Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/1564571048-15029-3-git-send-email-lowry.li@arm.com --- drivers/gpu/drm/drm_writeback.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c index ff138b6ec48b..43d9e3bb3a94 100644 --- a/drivers/gpu/drm/drm_writeback.c +++ b/drivers/gpu/drm/drm_writeback.c @@ -324,6 +324,9 @@ void drm_writeback_cleanup_job(struct drm_writeback_job *job) if (job->fb) drm_framebuffer_put(job->fb); + if (job->out_fence) + dma_fence_put(job->out_fence); + kfree(job); } EXPORT_SYMBOL(drm_writeback_cleanup_job); @@ -366,25 +369,29 @@ drm_writeback_signal_completion(struct drm_writeback_connector *wb_connector, { unsigned long flags; struct drm_writeback_job *job; + struct dma_fence *out_fence; spin_lock_irqsave(&wb_connector->job_lock, flags); job = list_first_entry_or_null(&wb_connector->job_queue, struct drm_writeback_job, list_entry); - if (job) { + if (job) list_del(&job->list_entry); - if (job->out_fence) { - if (status) - dma_fence_set_error(job->out_fence, status); - dma_fence_signal(job->out_fence); - dma_fence_put(job->out_fence); - } - } + spin_unlock_irqrestore(&wb_connector->job_lock, flags); if (WARN_ON(!job)) return; + out_fence = job->out_fence; + if (out_fence) { + if (status) + dma_fence_set_error(out_fence, status); + dma_fence_signal(out_fence); + dma_fence_put(out_fence); + job->out_fence = NULL; + } + INIT_WORK(&job->cleanup_work, cleanup_work); queue_work(system_long_wq, &job->cleanup_work); } From 0ec64895b05269c1814ea5befcadcd1184a12915 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Tue, 17 Sep 2019 14:52:00 -0400 Subject: [PATCH 032/312] nvmet: change ppl to lpp In nvmet_bdev_set_limits() the number of logical blocks per physical block is calculated, but the opposite is mentioned in the associated comment and reflected in the variable name. Correct the comment and adjust the variable name to reflect the calculation done. Signed-off-by: John Pittman Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/io-cmd-bdev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index de0bff70ebb6..32008d85172b 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -11,10 +11,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) { const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; - /* Number of physical blocks per logical block. */ - const u32 ppl = ql->physical_block_size / ql->logical_block_size; - /* Physical blocks per logical block, 0's based. */ - const __le16 ppl0b = to0based(ppl); + /* Number of logical blocks per physical block. */ + const u32 lpp = ql->physical_block_size / ql->logical_block_size; + /* Logical blocks per physical block, 0's based. */ + const __le16 lpp0b = to0based(lpp); /* * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, @@ -25,9 +25,9 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) * field from the identify controller data structure should be used. */ id->nsfeat |= 1 << 1; - id->nawun = ppl0b; - id->nawupf = ppl0b; - id->nacwu = ppl0b; + id->nawun = lpp0b; + id->nawupf = lpp0b; + id->nacwu = lpp0b; /* * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and @@ -36,7 +36,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) */ id->nsfeat |= 1 << 4; /* NPWG = Namespace Preferred Write Granularity. 0's based */ - id->npwg = ppl0b; + id->npwg = lpp0b; /* NPWA = Namespace Preferred Write Alignment. 0's based */ id->npwa = id->npwg; /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ From b224726de5e496dbf78147a66755c3d81e28bdd2 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 18 Sep 2019 00:27:20 +0000 Subject: [PATCH 033/312] nvme-pci: Fix a race in controller removal User space programs like udevd may try to read to partitions at the same time the driver detects a namespace is unusable, and may deadlock if revalidate_disk() is called while such a process is waiting to enter the frozen queue. On detecting a dead namespace, move the disk revalidate after unblocking dispatchers that may be holding bd_butex. changelog Suggested-by: Keith Busch Signed-off-by: Balbir Singh Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 108f60b46804..0c385b1994fe 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,10 +102,13 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) */ if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; - revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ns->queue); + /* + * Revalidate after unblocking dispatchers that may be holding bd_butex + */ + revalidate_disk(ns->disk); } static void nvme_queue_scan(struct nvme_ctrl *ctrl) From c9c53749375ccce0191f89b86732e642f914bd82 Mon Sep 17 00:00:00 2001 From: Laurence Oberman Date: Wed, 11 Sep 2019 09:56:42 -0400 Subject: [PATCH 034/312] scsi: bnx2fc: Handle scope bits when array returns BUSY or TSF The qla2xxx driver had this issue as well when the newer array firmware returned the retry_delay_timer in the fcp_rsp. The bnx2fc is not handling the masking of the scope bits either so the retry_delay_timestamp value lands up being a large value added to the timer timestamp delaying I/O for up to 27 Minutes. This patch adds similar code to handle this to the bnx2fc driver to avoid the huge delay. Link: https://lore.kernel.org/r/1568210202-12794-1-git-send-email-loberman@redhat.com Signed-off-by: Laurence Oberman Reported-by: David Jeffery Reported-by: kbuild test robot Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_io.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_io.c b/drivers/scsi/bnx2fc/bnx2fc_io.c index da00ca5fa5dc..401743e2b429 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_io.c +++ b/drivers/scsi/bnx2fc/bnx2fc_io.c @@ -1923,6 +1923,7 @@ void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req, struct fcoe_fcp_rsp_payload *fcp_rsp; struct bnx2fc_rport *tgt = io_req->tgt; struct scsi_cmnd *sc_cmd; + u16 scope = 0, qualifier = 0; /* scsi_cmd_cmpl is called with tgt lock held */ @@ -1990,12 +1991,30 @@ void bnx2fc_process_scsi_cmd_compl(struct bnx2fc_cmd *io_req, if (io_req->cdb_status == SAM_STAT_TASK_SET_FULL || io_req->cdb_status == SAM_STAT_BUSY) { - /* Set the jiffies + retry_delay_timer * 100ms - for the rport/tgt */ - tgt->retry_delay_timestamp = jiffies + - fcp_rsp->retry_delay_timer * HZ / 10; + /* Newer array firmware with BUSY or + * TASK_SET_FULL may return a status that needs + * the scope bits masked. + * Or a huge delay timestamp up to 27 minutes + * can result. + */ + if (fcp_rsp->retry_delay_timer) { + /* Upper 2 bits */ + scope = fcp_rsp->retry_delay_timer + & 0xC000; + /* Lower 14 bits */ + qualifier = fcp_rsp->retry_delay_timer + & 0x3FFF; + } + if (scope > 0 && qualifier > 0 && + qualifier <= 0x3FEF) { + /* Set the jiffies + + * retry_delay_timer * 100ms + * for the rport/tgt + */ + tgt->retry_delay_timestamp = jiffies + + (qualifier * HZ / 10); + } } - } if (io_req->fcp_resid) scsi_set_resid(sc_cmd, io_req->fcp_resid); From f51913eef23f74c3bd07899dc7f1ed6df9e521d8 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Wed, 18 Sep 2019 12:20:38 +0800 Subject: [PATCH 035/312] scsi: ufs: skip shutdown if hba is not powered In some cases, hba may go through shutdown flow without successful initialization and then make system hang. For example, if ufshcd_change_power_mode() gets error and leads to ufshcd_hba_exit() to release resources of the host, future shutdown flow may hang the system since the host register will be accessed in unpowered state. To solve this issue, simply add checking to skip shutdown for above kind of situation. Link: https://lore.kernel.org/r/1568780438-28753-1-git-send-email-stanley.chu@mediatek.com Signed-off-by: Stanley Chu Acked-by: Bean Huo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index c4a015e42045..57b2c3a8def3 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8140,6 +8140,9 @@ int ufshcd_shutdown(struct ufs_hba *hba) { int ret = 0; + if (!hba->is_powered) + goto out; + if (ufshcd_is_ufs_dev_poweroff(hba) && ufshcd_is_link_off(hba)) goto out; From 4b062e7cf940aa4f38fedc3a964b24cb095373f0 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Thu, 19 Sep 2019 16:55:48 +0900 Subject: [PATCH 036/312] scsi: qedf: Remove always false 'tmp_prio < 0' statement Since tmp_prio is declared as u8, the following statement is always false. tmp_prio < 0 So remove 'always false' statement. Link: https://lore.kernel.org/r/20190919075548.GA112801@LGEARND20B15 Signed-off-by: Austin Kim Acked-by: Saurav Kashyap Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 9c24f3834d70..1d2c111bdf82 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -596,7 +596,7 @@ static void qedf_dcbx_handler(void *dev, struct qed_dcbx_get *get, u32 mib_type) tmp_prio = get->operational.app_prio.fcoe; if (qedf_default_prio > -1) qedf->prio = qedf_default_prio; - else if (tmp_prio < 0 || tmp_prio > 7) { + else if (tmp_prio > 7) { QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "FIP/FCoE prio %d out of range, setting to %d.\n", tmp_prio, QEDF_DEFAULT_PRIO); From 0ed8810276907c8a633dc8cecc48dabb6678cd23 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 6 Sep 2019 10:24:20 -0700 Subject: [PATCH 037/312] scsi: storvsc: setup 1:1 mapping between hardware queue and CPU queue storvsc doesn't use a dedicated hardware queue for a given CPU queue. When issuing I/O, it selects returning CPU (hardware queue) dynamically based on vmbus channel usage across all channels. This patch advertises num_present_cpus() as number of hardware queues. This will have upper layer setup 1:1 mapping between hardware queue and CPU queue and avoid unnecessary locking when issuing I/O. Link: https://lore.kernel.org/r/1567790660-48142-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Long Li Reviewed-by: Michael Kelley Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index ed8b9ac805e6..542d2bac2922 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1837,8 +1837,7 @@ static int storvsc_probe(struct hv_device *device, /* * Set the number of HW queues we are supporting. */ - if (stor_device->num_sc != 0) - host->nr_hw_queues = stor_device->num_sc + 1; + host->nr_hw_queues = num_present_cpus(); /* * Set the error handler work queue. From 70054aa39a013fa52eff432f2223b8bd5c0048f8 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Sat, 7 Sep 2019 09:07:30 +0800 Subject: [PATCH 038/312] scsi: megaraid: disable device when probe failed after enabled device For pci device, need to disable device when probe failed after enabled device. Link: https://lore.kernel.org/r/1567818450-173315-1-git-send-email-chenxiang66@hisilicon.com Signed-off-by: Xiang Chen Reviewed-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 45a66048801b..ff6d4aa92421 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -4183,11 +4183,11 @@ megaraid_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) */ if (pdev->subsystem_vendor == PCI_VENDOR_ID_COMPAQ && pdev->subsystem_device == 0xC000) - return -ENODEV; + goto out_disable_device; /* Now check the magic signature byte */ pci_read_config_word(pdev, PCI_CONF_AMISIG, &magic); if (magic != HBA_SIGNATURE_471 && magic != HBA_SIGNATURE) - return -ENODEV; + goto out_disable_device; /* Ok it is probably a megaraid */ } From 4b6b1bb68628ec49e4cbfabd8ac17a169f079cd8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 13:40:35 +0800 Subject: [PATCH 039/312] scsi: hisi_sas: Make three functions static Fix sparse warnings: drivers/scsi/hisi_sas/hisi_sas_main.c:3686:6: warning: symbol 'hisi_sas_debugfs_release' was not declared. Should it be static? drivers/scsi/hisi_sas/hisi_sas_main.c:3708:5: warning: symbol 'hisi_sas_debugfs_alloc' was not declared. Should it be static? drivers/scsi/hisi_sas/hisi_sas_main.c:3799:6: warning: symbol 'hisi_sas_debugfs_bist_init' was not declared. Should it be static? Link: https://lore.kernel.org/r/20190923054035.19036-1-yuehaibing@huawei.com Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index d1513fdf1e00..0847e682797b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -3683,7 +3683,7 @@ void hisi_sas_debugfs_work_handler(struct work_struct *work) } EXPORT_SYMBOL_GPL(hisi_sas_debugfs_work_handler); -void hisi_sas_debugfs_release(struct hisi_hba *hisi_hba) +static void hisi_sas_debugfs_release(struct hisi_hba *hisi_hba) { struct device *dev = hisi_hba->dev; int i; @@ -3705,7 +3705,7 @@ void hisi_sas_debugfs_release(struct hisi_hba *hisi_hba) devm_kfree(dev, hisi_hba->debugfs_port_reg[i]); } -int hisi_sas_debugfs_alloc(struct hisi_hba *hisi_hba) +static int hisi_sas_debugfs_alloc(struct hisi_hba *hisi_hba) { const struct hisi_sas_hw *hw = hisi_hba->hw; struct device *dev = hisi_hba->dev; @@ -3796,7 +3796,7 @@ fail: return -ENOMEM; } -void hisi_sas_debugfs_bist_init(struct hisi_hba *hisi_hba) +static void hisi_sas_debugfs_bist_init(struct hisi_hba *hisi_hba) { hisi_hba->debugfs_bist_dentry = debugfs_create_dir("bist", hisi_hba->debugfs_dir); From 248a445adfc8c33ffd67cf1f2e336578e34f9e21 Mon Sep 17 00:00:00 2001 From: Himanshu Madhani Date: Thu, 12 Sep 2019 11:09:05 -0700 Subject: [PATCH 040/312] scsi: qla2xxx: Silence fwdump template message Print if fwdt template is present or not, only when ql2xextended_error_logging is enabled. Link: https://lore.kernel.org/r/20190912180918.6436-2-hmadhani@marvell.com Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index b6cdf108994c..f6a5d78f93c9 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -3190,7 +3190,7 @@ qla2x00_alloc_fw_dump(scsi_qla_host_t *vha) for (j = 0; j < 2; j++, fwdt++) { if (!fwdt->template) { - ql_log(ql_log_warn, vha, 0x00ba, + ql_dbg(ql_dbg_init, vha, 0x00ba, "-> fwdt%u no template\n", j); continue; } From c3b6a1d397420a0fdd97af2f06abfb78adc370df Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:06 -0700 Subject: [PATCH 041/312] scsi: qla2xxx: Fix unbound sleep in fcport delete path. There are instances, though rare, where a LOGO request cannot be sent out and the thread in free session done can wait indefinitely. Fix this by putting an upper bound to sleep. Link: https://lore.kernel.org/r/20190912180918.6436-3-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_target.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 0ffda6171614..b58ecd2d7fb6 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1020,6 +1020,7 @@ void qlt_free_session_done(struct work_struct *work) if (logout_started) { bool traced = false; + u16 cnt = 0; while (!READ_ONCE(sess->logout_completed)) { if (!traced) { @@ -1029,6 +1030,9 @@ void qlt_free_session_done(struct work_struct *work) traced = true; } msleep(100); + cnt++; + if (cnt > 200) + break; } ql_dbg(ql_dbg_disc, vha, 0xf087, From fd5564ba54e0d8a9e3e823d311b764232e09eb5f Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:07 -0700 Subject: [PATCH 042/312] scsi: qla2xxx: Fix stale mem access on driver unload On driver unload, 'remove_one' thread was allowed to advance, while session cleanup still lag behind. This patch ensures session deletion will finish before remove_one can advance. Link: https://lore.kernel.org/r/20190912180918.6436-4-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 1 + drivers/scsi/qla2xxx/qla_target.c | 21 ++++++++------------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 51154c049385..42980e52fb41 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1118,6 +1118,7 @@ qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha) qla2x00_mark_all_devices_lost(vha, 0); wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), 10*HZ); + flush_workqueue(vha->hw->wq); } /* diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index b58ecd2d7fb6..b5315be00b4d 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -953,7 +953,7 @@ void qlt_free_session_done(struct work_struct *work) struct qla_hw_data *ha = vha->hw; unsigned long flags; bool logout_started = false; - scsi_qla_host_t *base_vha; + scsi_qla_host_t *base_vha = pci_get_drvdata(ha->pdev); struct qlt_plogi_ack_t *own = sess->plogi_link[QLT_PLOGI_LINK_SAME_WWN]; @@ -1105,6 +1105,7 @@ void qlt_free_session_done(struct work_struct *work) } spin_unlock_irqrestore(&ha->tgt.sess_lock, flags); + sess->free_pending = 0; ql_dbg(ql_dbg_tgt_mgt, vha, 0xf001, "Unregistration of sess %p %8phC finished fcp_cnt %d\n", @@ -1113,17 +1114,8 @@ void qlt_free_session_done(struct work_struct *work) if (tgt && (tgt->sess_count == 0)) wake_up_all(&tgt->waitQ); - if (vha->fcport_count == 0) - wake_up_all(&vha->fcport_waitQ); - - base_vha = pci_get_drvdata(ha->pdev); - - sess->free_pending = 0; - - if (test_bit(PFLG_DRIVER_REMOVING, &base_vha->pci_flags)) - return; - - if ((!tgt || !tgt->tgt_stop) && !LOOP_TRANSITION(vha)) { + if (!test_bit(PFLG_DRIVER_REMOVING, &base_vha->pci_flags) && + (!tgt || !tgt->tgt_stop) && !LOOP_TRANSITION(vha)) { switch (vha->host->active_mode) { case MODE_INITIATOR: case MODE_DUAL: @@ -1136,6 +1128,9 @@ void qlt_free_session_done(struct work_struct *work) break; } } + + if (vha->fcport_count == 0) + wake_up_all(&vha->fcport_waitQ); } /* ha->tgt.sess_lock supposed to be held on entry */ @@ -1165,7 +1160,7 @@ void qlt_unreg_sess(struct fc_port *sess) sess->last_login_gen = sess->login_gen; INIT_WORK(&sess->free_work, qlt_free_session_done); - schedule_work(&sess->free_work); + queue_work(sess->vha->hw->wq, &sess->free_work); } EXPORT_SYMBOL(qlt_unreg_sess); From f5187b7d1ac66b61676f896751d3af9fcf8dd592 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:08 -0700 Subject: [PATCH 043/312] scsi: qla2xxx: Optimize NPIV tear down process In the case of NPIV port is being torn down, this patch will set a flag to indicate VPORT_DELETE. This would prevent relogin to be triggered. Link: https://lore.kernel.org/r/20190912180918.6436-5-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 2 ++ drivers/scsi/qla2xxx/qla_def.h | 1 + drivers/scsi/qla2xxx/qla_gs.c | 3 ++- drivers/scsi/qla2xxx/qla_mid.c | 32 +++++++++++++++++++++---------- drivers/scsi/qla2xxx/qla_os.c | 7 ++++++- drivers/scsi/qla2xxx/qla_target.c | 1 + 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index e9c449ef515c..8b3015361428 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -2920,6 +2920,8 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) struct qla_hw_data *ha = vha->hw; uint16_t id = vha->vp_idx; + set_bit(VPORT_DELETE, &vha->dpc_flags); + while (test_bit(LOOP_RESYNC_ACTIVE, &vha->dpc_flags) || test_bit(FCPORT_UPDATE_NEEDED, &vha->dpc_flags)) msleep(1000); diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 873a6aef1c5c..fa038568beb6 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4394,6 +4394,7 @@ typedef struct scsi_qla_host { #define IOCB_WORK_ACTIVE 31 #define SET_ZIO_THRESHOLD_NEEDED 32 #define ISP_ABORT_TO_ROM 33 +#define VPORT_DELETE 34 unsigned long pci_flags; #define PFLG_DISCONNECTED 0 /* PCI device removed */ diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c index dc0e36676313..5298ed10059f 100644 --- a/drivers/scsi/qla2xxx/qla_gs.c +++ b/drivers/scsi/qla2xxx/qla_gs.c @@ -3102,7 +3102,8 @@ int qla24xx_post_gpnid_work(struct scsi_qla_host *vha, port_id_t *id) { struct qla_work_evt *e; - if (test_bit(UNLOADING, &vha->dpc_flags)) + if (test_bit(UNLOADING, &vha->dpc_flags) || + (vha->vp_idx && test_bit(VPORT_DELETE, &vha->dpc_flags))) return 0; e = qla2x00_alloc_work(vha, QLA_EVT_GPNID); diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 1a9a11ae7285..6afad68e5ba2 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -66,6 +66,7 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) uint16_t vp_id; struct qla_hw_data *ha = vha->hw; unsigned long flags = 0; + u8 i; mutex_lock(&ha->vport_lock); /* @@ -75,8 +76,9 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha) * ensures no active vp_list traversal while the vport is removed * from the queue) */ - wait_event_timeout(vha->vref_waitq, !atomic_read(&vha->vref_count), - 10*HZ); + for (i = 0; i < 10 && atomic_read(&vha->vref_count); i++) + wait_event_timeout(vha->vref_waitq, + atomic_read(&vha->vref_count), HZ); spin_lock_irqsave(&ha->vport_slock, flags); if (atomic_read(&vha->vref_count)) { @@ -262,6 +264,9 @@ qla2x00_alert_all_vps(struct rsp_que *rsp, uint16_t *mb) spin_lock_irqsave(&ha->vport_slock, flags); list_for_each_entry(vha, &ha->vp_list, list) { if (vha->vp_idx) { + if (test_bit(VPORT_DELETE, &vha->dpc_flags)) + continue; + atomic_inc(&vha->vref_count); spin_unlock_irqrestore(&ha->vport_slock, flags); @@ -300,6 +305,20 @@ qla2x00_alert_all_vps(struct rsp_que *rsp, uint16_t *mb) int qla2x00_vp_abort_isp(scsi_qla_host_t *vha) { + fc_port_t *fcport; + + /* + * To exclusively reset vport, we need to log it out first. + * Note: This control_vp can fail if ISP reset is already + * issued, this is expected, as the vp would be already + * logged out due to ISP reset. + */ + if (!test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags)) { + qla24xx_control_vp(vha, VCE_COMMAND_DISABLE_VPS_LOGO_ALL); + list_for_each_entry(fcport, &vha->vp_fcports, list) + fcport->logout_on_delete = 0; + } + /* * Physical port will do most of the abort and recovery work. We can * just treat it as a loop down @@ -312,16 +331,9 @@ qla2x00_vp_abort_isp(scsi_qla_host_t *vha) atomic_set(&vha->loop_down_timer, LOOP_DOWN_TIME); } - /* - * To exclusively reset vport, we need to log it out first. Note: this - * control_vp can fail if ISP reset is already issued, this is - * expected, as the vp would be already logged out due to ISP reset. - */ - if (!test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags)) - qla24xx_control_vp(vha, VCE_COMMAND_DISABLE_VPS_LOGO_ALL); - ql_dbg(ql_dbg_taskm, vha, 0x801d, "Scheduling enable of Vport %d.\n", vha->vp_idx); + return qla24xx_enable_vp(vha); } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 42980e52fb41..5eda86ff6606 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1115,9 +1115,14 @@ static inline int test_fcport_count(scsi_qla_host_t *vha) void qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha) { + u8 i; + qla2x00_mark_all_devices_lost(vha, 0); - wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), 10*HZ); + for (i = 0; i < 10; i++) + wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha), + HZ); + flush_workqueue(vha->hw->wq); } diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index b5315be00b4d..a06e56224a55 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1115,6 +1115,7 @@ void qlt_free_session_done(struct work_struct *work) wake_up_all(&tgt->waitQ); if (!test_bit(PFLG_DRIVER_REMOVING, &base_vha->pci_flags) && + !(vha->vp_idx && test_bit(VPORT_DELETE, &vha->dpc_flags)) && (!tgt || !tgt->tgt_stop) && !LOOP_TRANSITION(vha)) { switch (vha->host->active_mode) { case MODE_INITIATOR: From 7f2a398d59d658818f3d219645164676fbbc88e8 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:09 -0700 Subject: [PATCH 044/312] scsi: qla2xxx: Fix N2N link reset Fix stalled link recovery for N2N with FC-NVMe connection. Link: https://lore.kernel.org/r/20190912180918.6436-6-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_def.h | 3 +- drivers/scsi/qla2xxx/qla_init.c | 105 +++++++++++++++++++++++--------- drivers/scsi/qla2xxx/qla_mbx.c | 23 ++++++- drivers/scsi/qla2xxx/qla_os.c | 4 ++ 4 files changed, 102 insertions(+), 33 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index fa038568beb6..6ffa9877c28b 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -2396,6 +2396,7 @@ typedef struct fc_port { unsigned int query:1; unsigned int id_changed:1; unsigned int scan_needed:1; + unsigned int n2n_flag:1; struct completion nvme_del_done; uint32_t nvme_prli_service_param; @@ -2446,7 +2447,6 @@ typedef struct fc_port { uint8_t fc4_type; uint8_t fc4f_nvme; uint8_t scan_state; - uint8_t n2n_flag; unsigned long last_queue_full; unsigned long last_ramp_up; @@ -3036,6 +3036,7 @@ enum scan_flags_t { enum fc4type_t { FS_FC4TYPE_FCP = BIT_0, FS_FC4TYPE_NVME = BIT_1, + FS_FCP_IS_N2N = BIT_7, }; struct fab_scan_rp { diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index f6a5d78f93c9..67f522a8a9d4 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -746,12 +746,15 @@ static void qla24xx_handle_gnl_done_event(scsi_qla_host_t *vha, break; default: if ((id.b24 != fcport->d_id.b24 && - fcport->d_id.b24) || + fcport->d_id.b24 && + fcport->loop_id != FC_NO_LOOP_ID) || (fcport->loop_id != FC_NO_LOOP_ID && fcport->loop_id != loop_id)) { ql_dbg(ql_dbg_disc, vha, 0x20e3, "%s %d %8phC post del sess\n", __func__, __LINE__, fcport->port_name); + if (fcport->n2n_flag) + fcport->d_id.b24 = 0; qlt_schedule_sess_for_deletion(fcport); return; } @@ -759,6 +762,8 @@ static void qla24xx_handle_gnl_done_event(scsi_qla_host_t *vha, } fcport->loop_id = loop_id; + if (fcport->n2n_flag) + fcport->d_id.b24 = id.b24; wwn = wwn_to_u64(fcport->port_name); qlt_find_sess_invalidate_other(vha, wwn, @@ -972,7 +977,7 @@ static void qla24xx_async_gnl_sp_done(srb_t *sp, int res) wwn = wwn_to_u64(e->port_name); ql_dbg(ql_dbg_disc + ql_dbg_verbose, vha, 0x20e8, - "%s %8phC %02x:%02x:%02x state %d/%d lid %x \n", + "%s %8phC %02x:%02x:%02x CLS %x/%x lid %x \n", __func__, (void *)&wwn, e->port_id[2], e->port_id[1], e->port_id[0], e->current_login_state, e->last_login_state, (loop_id & 0x7fff)); @@ -1499,7 +1504,8 @@ int qla24xx_fcport_handle_login(struct scsi_qla_host *vha, fc_port_t *fcport) (fcport->fw_login_state == DSC_LS_PRLI_PEND))) return 0; - if (fcport->fw_login_state == DSC_LS_PLOGI_COMP) { + if (fcport->fw_login_state == DSC_LS_PLOGI_COMP && + !N2N_TOPO(vha->hw)) { if (time_before_eq(jiffies, fcport->plogi_nack_done_deadline)) { set_bit(RELOGIN_NEEDED, &vha->dpc_flags); return 0; @@ -1570,8 +1576,9 @@ int qla24xx_fcport_handle_login(struct scsi_qla_host *vha, fc_port_t *fcport) qla24xx_post_gpdb_work(vha, fcport, 0); } else { ql_dbg(ql_dbg_disc, vha, 0x2118, - "%s %d %8phC post NVMe PRLI\n", - __func__, __LINE__, fcport->port_name); + "%s %d %8phC post %s PRLI\n", + __func__, __LINE__, fcport->port_name, + fcport->fc4f_nvme ? "NVME" : "FC"); qla24xx_post_prli_work(vha, fcport); } break; @@ -1853,17 +1860,38 @@ qla24xx_handle_prli_done_event(struct scsi_qla_host *vha, struct event_arg *ea) break; } - if (ea->fcport->n2n_flag) { + if (ea->fcport->fc4f_nvme) { ql_dbg(ql_dbg_disc, vha, 0x2118, "%s %d %8phC post fc4 prli\n", __func__, __LINE__, ea->fcport->port_name); ea->fcport->fc4f_nvme = 0; - ea->fcport->n2n_flag = 0; qla24xx_post_prli_work(vha, ea->fcport); + return; + } + + /* at this point both PRLI NVME & PRLI FCP failed */ + if (N2N_TOPO(vha->hw)) { + if (ea->fcport->n2n_link_reset_cnt < 3) { + ea->fcport->n2n_link_reset_cnt++; + /* + * remote port is not sending Plogi. Reset + * link to kick start his state machine + */ + set_bit(N2N_LINK_RESET, &vha->dpc_flags); + } else { + ql_log(ql_log_warn, vha, 0x2119, + "%s %d %8phC Unable to reconnect\n", + __func__, __LINE__, ea->fcport->port_name); + } + } else { + /* + * switch connect. login failed. Take connection + * down and allow relogin to retrigger + */ + ea->fcport->flags &= ~FCF_ASYNC_SENT; + ea->fcport->keep_nport_handle = 0; + qlt_schedule_sess_for_deletion(ea->fcport); } - ql_dbg(ql_dbg_disc, vha, 0x2119, - "%s %d %8phC unhandle event of %x\n", - __func__, __LINE__, ea->fcport->port_name, ea->data[0]); break; } } @@ -5000,28 +5028,47 @@ qla2x00_configure_local_loop(scsi_qla_host_t *vha) unsigned long flags; /* Inititae N2N login. */ - if (test_and_clear_bit(N2N_LOGIN_NEEDED, &vha->dpc_flags)) { - /* borrowing */ - u32 *bp, i, sz; + if (N2N_TOPO(ha)) { + if (test_and_clear_bit(N2N_LOGIN_NEEDED, &vha->dpc_flags)) { + /* borrowing */ + u32 *bp, i, sz; - memset(ha->init_cb, 0, ha->init_cb_size); - sz = min_t(int, sizeof(struct els_plogi_payload), - ha->init_cb_size); - rval = qla24xx_get_port_login_templ(vha, ha->init_cb_dma, - (void *)ha->init_cb, sz); - if (rval == QLA_SUCCESS) { - bp = (uint32_t *)ha->init_cb; - for (i = 0; i < sz/4 ; i++, bp++) - *bp = cpu_to_be32(*bp); + memset(ha->init_cb, 0, ha->init_cb_size); + sz = min_t(int, sizeof(struct els_plogi_payload), + ha->init_cb_size); + rval = qla24xx_get_port_login_templ(vha, + ha->init_cb_dma, (void *)ha->init_cb, sz); + if (rval == QLA_SUCCESS) { + bp = (uint32_t *)ha->init_cb; + for (i = 0; i < sz/4 ; i++, bp++) + *bp = cpu_to_be32(*bp); - memcpy(&ha->plogi_els_payld.data, (void *)ha->init_cb, - sizeof(ha->plogi_els_payld.data)); - set_bit(RELOGIN_NEEDED, &vha->dpc_flags); - } else { - ql_dbg(ql_dbg_init, vha, 0x00d1, - "PLOGI ELS param read fail.\n"); + memcpy(&ha->plogi_els_payld.data, + (void *)ha->init_cb, + sizeof(ha->plogi_els_payld.data)); + set_bit(RELOGIN_NEEDED, &vha->dpc_flags); + } else { + ql_dbg(ql_dbg_init, vha, 0x00d1, + "PLOGI ELS param read fail.\n"); + goto skip_login; + } + } + + list_for_each_entry(fcport, &vha->vp_fcports, list) { + if (fcport->n2n_flag) { + qla24xx_fcport_handle_login(vha, fcport); + return QLA_SUCCESS; + } + } +skip_login: + spin_lock_irqsave(&vha->work_lock, flags); + vha->scan.scan_retry++; + spin_unlock_irqrestore(&vha->work_lock, flags); + + if (vha->scan.scan_retry < MAX_SCAN_RETRIES) { + set_bit(LOCAL_LOOP_UPDATE, &vha->dpc_flags); + set_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags); } - return QLA_SUCCESS; } found_devs = 0; diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 4c858e2d0ea8..6d6e10812b42 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -2249,7 +2249,7 @@ qla2x00_lip_reset(scsi_qla_host_t *vha) mbx_cmd_t mc; mbx_cmd_t *mcp = &mc; - ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x105a, + ql_dbg(ql_dbg_disc, vha, 0x105a, "Entered %s.\n", __func__); if (IS_CNA_CAPABLE(vha->hw)) { @@ -3883,14 +3883,23 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, case TOPO_N2N: ha->current_topology = ISP_CFG_N; spin_lock_irqsave(&vha->hw->tgt.sess_lock, flags); + list_for_each_entry(fcport, &vha->vp_fcports, list) { + fcport->scan_state = QLA_FCPORT_SCAN; + fcport->n2n_flag = 0; + } + fcport = qla2x00_find_fcport_by_wwpn(vha, rptid_entry->u.f1.port_name, 1); spin_unlock_irqrestore(&vha->hw->tgt.sess_lock, flags); if (fcport) { fcport->plogi_nack_done_deadline = jiffies + HZ; - fcport->dm_login_expire = jiffies + 3*HZ; + fcport->dm_login_expire = jiffies + 2*HZ; fcport->scan_state = QLA_FCPORT_FOUND; + fcport->n2n_flag = 1; + if (vha->flags.nvme_enabled) + fcport->fc4f_nvme = 1; + switch (fcport->disc_state) { case DSC_DELETED: set_bit(RELOGIN_NEEDED, @@ -3924,7 +3933,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, rptid_entry->u.f1.port_name, rptid_entry->u.f1.node_name, NULL, - FC4_TYPE_UNKNOWN); + FS_FCP_IS_N2N); } /* if our portname is higher then initiate N2N login */ @@ -4023,6 +4032,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, list_for_each_entry(fcport, &vha->vp_fcports, list) { fcport->scan_state = QLA_FCPORT_SCAN; + fcport->n2n_flag = 0; } fcport = qla2x00_find_fcport_by_wwpn(vha, @@ -4032,6 +4042,13 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, fcport->login_retry = vha->hw->login_retry_count; fcport->plogi_nack_done_deadline = jiffies + HZ; fcport->scan_state = QLA_FCPORT_FOUND; + fcport->n2n_flag = 1; + fcport->d_id.b.domain = + rptid_entry->u.f2.remote_nport_id[2]; + fcport->d_id.b.area = + rptid_entry->u.f2.remote_nport_id[1]; + fcport->d_id.b.al_pa = + rptid_entry->u.f2.remote_nport_id[0]; } } } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 5eda86ff6606..8411dd5acd43 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -5033,6 +5033,10 @@ void qla24xx_create_new_sess(struct scsi_qla_host *vha, struct qla_work_evt *e) memcpy(fcport->port_name, e->u.new_sess.port_name, WWN_SIZE); + + if (e->u.new_sess.fc4_type & FS_FCP_IS_N2N) + fcport->n2n_flag = 1; + } else { ql_dbg(ql_dbg_disc, vha, 0xffff, "%s %8phC mem alloc fail.\n", From f3f1938bb673b1b5ad182c4608f5f8a24921eea3 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:10 -0700 Subject: [PATCH 045/312] scsi: qla2xxx: Fix N2N link up fail During link up/bounce, qla driver would do command flush as part of cleanup. In this case, the flush can intefere with FW state. This patch allows FW to be in control of link up. Link: https://lore.kernel.org/r/20190912180918.6436-7-hmadhani@marvell.com Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_mbx.c | 2 ++ drivers/scsi/qla2xxx/qla_os.c | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 6d6e10812b42..1cc6913f76c4 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -3897,6 +3897,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, fcport->dm_login_expire = jiffies + 2*HZ; fcport->scan_state = QLA_FCPORT_FOUND; fcport->n2n_flag = 1; + fcport->keep_nport_handle = 1; if (vha->flags.nvme_enabled) fcport->fc4f_nvme = 1; @@ -4042,6 +4043,7 @@ qla24xx_report_id_acquisition(scsi_qla_host_t *vha, fcport->login_retry = vha->hw->login_retry_count; fcport->plogi_nack_done_deadline = jiffies + HZ; fcport->scan_state = QLA_FCPORT_FOUND; + fcport->keep_nport_handle = 1; fcport->n2n_flag = 1; fcport->d_id.b.domain = rptid_entry->u.f2.remote_nport_id[2]; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 8411dd5acd43..ee47de9fbc05 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -5135,11 +5135,9 @@ void qla24xx_create_new_sess(struct scsi_qla_host *vha, struct qla_work_evt *e) if (dfcp) qlt_schedule_sess_for_deletion(tfcp); - - if (N2N_TOPO(vha->hw)) - fcport->flags &= ~FCF_FABRIC_DEVICE; - if (N2N_TOPO(vha->hw)) { + fcport->flags &= ~FCF_FABRIC_DEVICE; + fcport->keep_nport_handle = 1; if (vha->flags.nvme_enabled) { fcport->fc4f_nvme = 1; fcport->n2n_flag = 1; From 0aabb6b699f72dca96988d3f428e222f932dc889 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 12 Sep 2019 11:09:11 -0700 Subject: [PATCH 046/312] scsi: qla2xxx: Fix Nport ID display value For N2N, the NPort ID is assigned by driver in the PLOGI ELS. According to FW Spec the byte order for SID is not the same as DID. Link: https://lore.kernel.org/r/20190912180918.6436-8-hmadhani@marvell.com Reviewed-by: Roman Bolshakov Tested-by: Roman Bolshakov Signed-off-by: Quinn Tran Signed-off-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_iocb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index e92e52aa6e9b..518eb954cf42 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -2656,9 +2656,10 @@ qla24xx_els_logo_iocb(srb_t *sp, struct els_entry_24xx *els_iocb) els_iocb->port_id[0] = sp->fcport->d_id.b.al_pa; els_iocb->port_id[1] = sp->fcport->d_id.b.area; els_iocb->port_id[2] = sp->fcport->d_id.b.domain; - els_iocb->s_id[0] = vha->d_id.b.al_pa; - els_iocb->s_id[1] = vha->d_id.b.area; - els_iocb->s_id[2] = vha->d_id.b.domain; + /* For SID the byte order is different than DID */ + els_iocb->s_id[1] = vha->d_id.b.al_pa; + els_iocb->s_id[2] = vha->d_id.b.area; + els_iocb->s_id[0] = vha->d_id.b.domain; if (elsio->u.els_logo.els_cmd == ELS_DCMD_PLOGI) { els_iocb->control_flags = 0; From d2f15428d6a0ebfc0edc364094d7c4a2de7037ed Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 22 Sep 2019 00:55:46 -0500 Subject: [PATCH 047/312] smb3: fix leak in "open on server" perf counter We were not bumping up the "open on server" (num_remote_opens) counter (in some cases) on opens of the share root so could end up showing as a negative value. CC: Stable Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 5 +++++ fs/cifs/smb2pdu.c | 1 + 2 files changed, 6 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eaed18061314..3010066a8709 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -751,6 +751,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) goto oshr_exit; } + atomic_inc(&tcon->num_remote_opens); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -1176,6 +1178,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, flags, 3, rqst, resp_buftype, rsp_iov); + /* no need to bump num_remote_opens because handle immediately closed */ sea_exit: kfree(ea); @@ -1518,6 +1521,8 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; + + /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { pqi = (struct smb_query_info __user *)arg; io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87066f1af12c..5f2491efd950 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2352,6 +2352,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ From 388962e8e9ce8b253e91cbaed94e78a07dc92d84 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:06:18 +0800 Subject: [PATCH 048/312] fs/cifs/smb2pdu.c: Make SMB2_notify_init static Fix sparse warnings: fs/cifs/smb2pdu.c:3200:1: warning: symbol 'SMB2_notify_init' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5f2491efd950..ea08e6159481 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3181,7 +3181,7 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, * See MS-SMB2 2.2.35 and 2.2.36 */ -int +static int SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 completion_filter, bool watch_tree) From 8559ad8e89185ca55d8c9ef1e0b20f58578a4055 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Sep 2019 10:13:47 +0800 Subject: [PATCH 049/312] fs/cifs/sess.c: Remove set but not used variable 'capabilities' Fixes gcc '-Wunused-but-set-variable' warning: fs/cifs/sess.c: In function sess_auth_lanman: fs/cifs/sess.c:910:8: warning: variable capabilities set but not used [-Wunused-but-set-variable] Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Steve French --- fs/cifs/sess.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4c764ff7edd2..85bd644f9773 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -698,7 +698,6 @@ sess_auth_lanman(struct sess_data *sess_data) char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - __u32 capabilities; __u16 bytes_remaining; /* lanman 2 style sessionsetup */ @@ -709,7 +708,7 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + (void)cifs_ssetup_hdr(ses, pSMB); pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; From 63d37fb4ce5ae7bf1e58f906d1bf25f036fe79b2 Mon Sep 17 00:00:00 2001 From: Murphy Zhou Date: Sat, 21 Sep 2019 19:26:00 +0800 Subject: [PATCH 050/312] CIFS: fix max ea value size It should not be larger then the slab max buf size. If user specifies a larger size, it passes this check and goes straightly to SMB2_set_info_init performing an insecure memcpy. Signed-off-by: Murphy Zhou Reviewed-by: Aurelien Aptel CC: Stable Signed-off-by: Steve French --- fs/cifs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9076150758d8..db4ba8f6077e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -31,7 +31,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#define MAX_EA_VALUE_SIZE 65535 +#define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ From 34c0989c05314b0288b058a4d1f6e58e2d320929 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:28 +0200 Subject: [PATCH 051/312] iommu/amd: Fix pages leak in free_pagetable() Take into account the gathered freelist in free_sub_pt(), otherwise we end up leaking all that pages. Fixes: 409afa44f9ba ("iommu/amd: Introduce free_sub_pt() function") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1ed3b98324ba..138547446345 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1425,7 +1425,7 @@ static void free_pagetable(struct protection_domain *domain) BUG_ON(domain->mode < PAGE_MODE_NONE || domain->mode > PAGE_MODE_6_LEVEL); - free_sub_pt(root, domain->mode, freelist); + freelist = free_sub_pt(root, domain->mode, freelist); free_page_list(freelist); } From 6ccb72f8374e17d60b58a7bfd5570496332c54e2 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:29 +0200 Subject: [PATCH 052/312] iommu/amd: Fix downgrading default page-sizes in alloc_pte() Downgrading an existing large mapping to a mapping using smaller page-sizes works only for the mappings created with page-mode 7 (i.e. non-default page size). Treat large mappings created with page-mode 0 (i.e. default page size) like a non-present mapping and allow to overwrite it in alloc_pte(). While around, make sure that we flush the TLB only if we change an existing mapping, otherwise we might end up acting on garbage PTEs. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 138547446345..c7e28a8d25d1 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1490,6 +1490,7 @@ static u64 *alloc_pte(struct protection_domain *domain, pte_level = PM_PTE_LEVEL(__pte); if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE || pte_level == PAGE_MODE_7_LEVEL) { page = (u64 *)get_zeroed_page(gfp); if (!page) @@ -1500,7 +1501,7 @@ static u64 *alloc_pte(struct protection_domain *domain, /* pte could have been changed somewhere. */ if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); - else if (pte_level == PAGE_MODE_7_LEVEL) + else if (IOMMU_PTE_PRESENT(__pte)) domain->updated = true; continue; From 7f1f1683c1e27c280556766cfd2c219957cebe4f Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:30 +0200 Subject: [PATCH 053/312] iommu/amd: Introduce first_pte_l7() helper Given an arbitrary pte that is part of a large mapping, this function returns the first pte of the series (and optionally the mapped size and number of PTEs) It will be re-used in a subsequent patch to replace an existing L7 mapping. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 40 ++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c7e28a8d25d1..a227e7a9b8b7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev) */ } +/* + * Helper function to get the first pte of a large mapping + */ +static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, + unsigned long *count) +{ + unsigned long pte_mask, pg_size, cnt; + u64 *fpte; + + pg_size = PTE_PAGE_SIZE(*pte); + cnt = PAGE_SIZE_PTE_COUNT(pg_size); + pte_mask = ~((cnt << 3) - 1); + fpte = (u64 *)(((unsigned long)pte) & pte_mask); + + if (page_size) + *page_size = pg_size; + + if (count) + *count = cnt; + + return fpte; +} + /**************************************************************************** * * Interrupt handling functions @@ -1567,17 +1590,12 @@ static u64 *fetch_pte(struct protection_domain *domain, *page_size = PTE_LEVEL_PAGE_SIZE(level); } - if (PM_PTE_LEVEL(*pte) == 0x07) { - unsigned long pte_mask; - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - *page_size = pte_mask = PTE_PAGE_SIZE(*pte); - pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); - pte = (u64 *)(((unsigned long)pte) & pte_mask); - } + /* + * If we have a series of large PTEs, make + * sure to return a pointer to the first one. + */ + if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) + pte = first_pte_l7(pte, page_size, NULL); return pte; } From cc449541f2a8a646ad5ac45cb1c7b59c3ed6b310 Mon Sep 17 00:00:00 2001 From: Andrei Dulea Date: Fri, 13 Sep 2019 16:42:31 +0200 Subject: [PATCH 054/312] iommu/amd: Unmap all L7 PTEs when downgrading page-sizes When replacing a large mapping created with page-mode 7 (i.e. non-default page size), tear down the entire series of replicated PTEs. Besides providing access to the old mapping, another thing that might go wrong with this issue is on the fetch_pte() code path that can return a PDE entry of the newly re-mapped range. While at it, make sure that we flush the TLB in case alloc_pte() fails and returns NULL at a lower level. Fixes: 6d568ef9a622 ("iommu/amd: Allow downgrading page-sizes in alloc_pte()") Signed-off-by: Andrei Dulea --- drivers/iommu/amd_iommu.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a227e7a9b8b7..fda9923542c9 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1512,10 +1512,32 @@ static u64 *alloc_pte(struct protection_domain *domain, __pte = *pte; pte_level = PM_PTE_LEVEL(__pte); - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE || + /* + * If we replace a series of large PTEs, we need + * to tear down all of them. + */ + if (IOMMU_PTE_PRESENT(__pte) && pte_level == PAGE_MODE_7_LEVEL) { + unsigned long count, i; + u64 *lpte; + + lpte = first_pte_l7(pte, NULL, &count); + + /* + * Unmap the replicated PTEs that still match the + * original large mapping + */ + for (i = 0; i < count; ++i) + cmpxchg64(&lpte[i], __pte, 0ULL); + + domain->updated = true; + continue; + } + + if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE) { page = (u64 *)get_zeroed_page(gfp); + if (!page) return NULL; @@ -1646,8 +1668,10 @@ static int iommu_map_page(struct protection_domain *dom, count = PAGE_SIZE_PTE_COUNT(page_size); pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); - if (!pte) + if (!pte) { + update_domain(dom); return -ENOMEM; + } for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); From 0b15e02f0cc4fb34a9160de7ba6db3a4013dc1b7 Mon Sep 17 00:00:00 2001 From: Filippo Sironi Date: Tue, 10 Sep 2019 19:49:21 +0200 Subject: [PATCH 055/312] iommu/amd: Wait for completion of IOTLB flush in attach_device To make sure the domain tlb flush completes before the function returns, explicitly wait for its completion. Signed-off-by: Filippo Sironi Fixes: 42a49f965a8d ("amd-iommu: flush domain tlb when attaching a new device") [joro: Added commit message and fixes tag] Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index fda9923542c9..7bdce3b10f3d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2212,6 +2212,8 @@ skip_ats_check: */ domain_flush_tlb_pde(domain); + domain_flush_complete(domain); + return ret; } From 9f7fec0ba89108b9385f1b9fb167861224912a4a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Sep 2019 13:08:52 +0100 Subject: [PATCH 056/312] Btrfs: fix selftests failure due to uninitialized i_mode in test inodes Some of the self tests create a test inode, setup some extents and then do calls to btrfs_get_extent() to test that the corresponding extent maps exist and are correct. However btrfs_get_extent(), since the 5.2 merge window, now errors out when it finds a regular or prealloc extent for an inode that does not correspond to a regular file (its ->i_mode is not S_IFREG). This causes the self tests to fail sometimes, specially when KASAN, slub_debug and page poisoning are enabled: $ modprobe btrfs modprobe: ERROR: could not insert 'btrfs': Invalid argument $ dmesg [ 9414.691648] Btrfs loaded, crc32c=crc32c-intel, debug=on, assert=on, integrity-checker=on, ref-verify=on [ 9414.692655] BTRFS: selftest: sectorsize: 4096 nodesize: 4096 [ 9414.692658] BTRFS: selftest: running btrfs free space cache tests [ 9414.692918] BTRFS: selftest: running extent only tests [ 9414.693061] BTRFS: selftest: running bitmap only tests [ 9414.693366] BTRFS: selftest: running bitmap and extent tests [ 9414.696455] BTRFS: selftest: running space stealing from bitmap to extent tests [ 9414.697131] BTRFS: selftest: running extent buffer operation tests [ 9414.697133] BTRFS: selftest: running btrfs_split_item tests [ 9414.697564] BTRFS: selftest: running extent I/O tests [ 9414.697583] BTRFS: selftest: running find delalloc tests [ 9415.081125] BTRFS: selftest: running find_first_clear_extent_bit test [ 9415.081278] BTRFS: selftest: running extent buffer bitmap tests [ 9415.124192] BTRFS: selftest: running inode tests [ 9415.124195] BTRFS: selftest: running btrfs_get_extent tests [ 9415.127909] BTRFS: selftest: running hole first btrfs_get_extent test [ 9415.128343] BTRFS critical (device (efault)): regular/prealloc extent found for non-regular inode 256 [ 9415.131428] BTRFS: selftest: fs/btrfs/tests/inode-tests.c:904 expected a real extent, got 0 This happens because the test inodes are created without ever initializing the i_mode field of the inode, and neither VFS's new_inode() nor the btrfs callback btrfs_alloc_inode() initialize the i_mode. Initialization of the i_mode is done through the various callbacks used by the VFS to create new inodes (regular files, directories, symlinks, tmpfiles, etc), which all call btrfs_new_inode() which in turn calls inode_init_owner(), which sets the inode's i_mode. Since the tests only uses new_inode() to create the test inodes, the i_mode was never initialized. This always happens on a VM I used with kasan, slub_debug and many other debug facilities enabled. It also happened to someone who reported this on bugzilla (on a 5.3-rc). Fix this by setting i_mode to S_IFREG at btrfs_new_test_inode(). Fixes: 6bf9e4bd6a2778 ("btrfs: inode: Verify inode mode to avoid NULL pointer dereference") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204397 Signed-off-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/btrfs-tests.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b5e80563efaa..99fe9bf3fdac 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -52,7 +52,13 @@ static struct file_system_type test_type = { struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } static int btrfs_init_test_fs(void) From eb5b64f142504a597d67e2109d603055ff765e52 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 13 Sep 2019 14:54:07 +0100 Subject: [PATCH 057/312] btrfs: adjust dirty_metadata_bytes after writeback failure of extent buffer Before, if a eb failed to write out, we would end up triggering a BUG_ON(). As of f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up"), we no longer BUG_ON(), so we should make life consistent and add back the unwritten bytes to dirty_metadata_bytes. Fixes: f4340622e022 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up") CC: stable@vger.kernel.org # 5.2+ Reviewed-by: Filipe Manana Signed-off-by: Dennis Zhou Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dc5e6939856..67066ece4de3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3728,11 +3728,20 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; + /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); + /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. From 0607eb1d452d45c5ac4c745a9e9e0d95152ea9d0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Sep 2019 17:42:28 +0100 Subject: [PATCH 058/312] Btrfs: fix missing error return if writeback for extent buffer never started If lock_extent_buffer_for_io() fails, it returns a negative value, but its caller btree_write_cache_pages() ignores such error. This means that a call to flush_write_bio(), from lock_extent_buffer_for_io(), might have failed. We should make btree_write_cache_pages() notice such error values and stop immediatelly, making sure filemap_fdatawrite_range() returns an error to the transaction commit path. A failure from flush_write_bio() should also result in the endio callback end_bio_extent_buffer_writepage() being invoked, which sets the BTRFS_FS_*_ERR bits appropriately, so that there's no risk a transaction or log commit doesn't catch a writeback failure. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 67066ece4de3..380ced84d4b1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3978,6 +3978,10 @@ retry: if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } ret = write_one_eb(eb, wbc, &epd); From 13fc1d271a2e3ab8a02071e711add01fab9271f6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 24 Sep 2019 10:49:54 +0100 Subject: [PATCH 059/312] Btrfs: fix race setting up and completing qgroup rescan workers There is a race between setting up a qgroup rescan worker and completing a qgroup rescan worker that can lead to callers of the qgroup rescan wait ioctl to either not wait for the rescan worker to complete or to hang forever due to missing wake ups. The following diagram shows a sequence of steps that illustrates the race. CPU 1 CPU 2 CPU 3 btrfs_ioctl_quota_rescan() btrfs_qgroup_rescan() qgroup_rescan_init() mutex_lock(&fs_info->qgroup_rescan_lock) spin_lock(&fs_info->qgroup_lock) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN init_completion( &fs_info->qgroup_rescan_completion) fs_info->qgroup_rescan_running = true mutex_unlock(&fs_info->qgroup_rescan_lock) spin_unlock(&fs_info->qgroup_lock) btrfs_init_work() --> starts the worker btrfs_qgroup_rescan_worker() mutex_lock(&fs_info->qgroup_rescan_lock) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN mutex_unlock(&fs_info->qgroup_rescan_lock) starts transaction, updates qgroup status item, etc btrfs_ioctl_quota_rescan() btrfs_qgroup_rescan() qgroup_rescan_init() mutex_lock(&fs_info->qgroup_rescan_lock) spin_lock(&fs_info->qgroup_lock) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN init_completion( &fs_info->qgroup_rescan_completion) fs_info->qgroup_rescan_running = true mutex_unlock(&fs_info->qgroup_rescan_lock) spin_unlock(&fs_info->qgroup_lock) btrfs_init_work() --> starts another worker mutex_lock(&fs_info->qgroup_rescan_lock) fs_info->qgroup_rescan_running = false mutex_unlock(&fs_info->qgroup_rescan_lock) complete_all(&fs_info->qgroup_rescan_completion) Before the rescan worker started by the task at CPU 3 completes, if another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at fs_info->qgroup_flags, which is expected and correct behaviour. However if other task calls btrfs_ioctl_quota_rescan_wait() before the rescan worker started by the task at CPU 3 completes, it will return immediately without waiting for the new rescan worker to complete, because fs_info->qgroup_rescan_running is set to false by CPU 2. This race is making test case btrfs/171 (from fstests) to fail often: btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad) --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100 +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100 @@ -1,2 +1,3 @@ QA output created by 171 +ERROR: quota rescan failed: Operation now in progress Silence is golden ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff) That is because the test calls the btrfs-progs commands "qgroup quota rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs" commands 'qgroup assign' and 'qgroup remove' often call the rescan start ioctl after calling the qgroup assign ioctl, btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait for a rescan worker to complete. Another problem the race can cause is missing wake ups for waiters, since the call to complete_all() happens outside a critical section and after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence diagram above, if we have a waiter for the first rescan task (executed by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and before it calls complete_all() against fs_info->qgroup_rescan_completion, the task at CPU 3 calls init_completion() against fs_info->qgroup_rescan_completion which re-initilizes its wait queue to an empty queue, therefore causing the rescan worker at CPU 2 to call complete_all() against an empty queue, never waking up the task waiting for that rescan worker. Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting fs_info->qgroup_rescan_running to false in the same critical section, delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the call to complete_all() in that same critical section. This gives the protection needed to avoid rescan wait ioctl callers not waiting for a running rescan worker and the lost wake ups problem, since setting that rescan flag and boolean as well as initializing the wait queue is done already in a critical section delimited by that mutex (at qgroup_rescan_init()). Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion") Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8d3bd799ac7d..52701c1be109 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3166,9 +3166,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -3184,16 +3181,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); - goto done; } - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { @@ -3204,12 +3215,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* From a6f197f889ce0a5fd583674d9fa39259f5c8f72f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 09:54:40 +0530 Subject: [PATCH 060/312] powerpc/book3s64: Export has_transparent_hugepage() related functions. In later patch, we want to use hash_transparent_hugepage() in a kernel module. Export two related functions. Acked-by: Michael Ellerman Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190924042440.27946-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/include/asm/book3s/64/radix.h | 8 +++++++- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 7 ------- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index e04a839cb5b9..65a6966f1de4 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -254,7 +254,13 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); -extern int radix__has_transparent_hugepage(void); +static inline int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} #endif extern int __meminit radix__vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index d1f390ac9cdb..64733b9cb20a 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -406,6 +406,8 @@ int hash__has_transparent_hugepage(void) return 1; } +EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b4ca9e95e678..dc7a38f0a45b 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1057,13 +1057,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } -int radix__has_transparent_hugepage(void) -{ - /* For radix 2M at PMD level means thp */ - if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) - return 1; - return 0; -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, From f537669978a7abae29c1d3b489f300e4d8f47005 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Sep 2019 21:16:03 +0530 Subject: [PATCH 061/312] libnvdimm/dax: Pick the right alignment default when creating dax devices Allow arch to provide the supported alignments and use hugepage alignment only if we support hugepage. Right now we depend on compile time configs whereas this patch switch this to runtime discovery. Architectures like ppc64 can have THP enabled in code, but then can have hugepage size disabled by the hypervisor. This allows us to create dax devices with PAGE_SIZE alignment in this case. Existing dax namespace with alignment larger than PAGE_SIZE will fail to initialize in this specific case. We still allow fsdax namespace initialization. With respect to identifying whether to enable hugepage fault for a dax device, if THP is enabled during compile, we default to taking hugepage fault and in dax fault handler if we find the fault size > alignment we retry with PAGE_SIZE fault size. This also addresses the below failure scenario on ppc64 ndctl create-namespace --mode=devdax | grep align "align":16777216, "align":16777216 cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments 65536 16777216 daxio.static-debug -z -o /dev/dax0.0 Bus error (core dumped) $ dmesg | tail lpar: Failed hash pte insert with error -4 hash-mmu: mm: Hashing failure ! EA=0x7fff17000000 access=0x8000000000000006 current=daxio hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc000000501002b86 daxio[3860]: bus error (7) at 7fff17000000 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b0000+20000] daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120 daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128 e93f0088 39290008 f93f0110 The failure was due to guest kernel using wrong page size. The namespaces created with 16M alignment will appear as below on a config with 16M page size disabled. $ ndctl list -Ni [ { "dev":"namespace0.1", "mode":"fsdax", "map":"dev", "size":5351931904, "uuid":"fc6e9667-461a-4718-82b4-69b24570bddb", "align":16777216, "blockdev":"pmem0.1", "supported_alignments":[ 65536 ] }, { "dev":"namespace0.0", "mode":"fsdax", <==== devdax 16M alignment marked disabled. "map":"mem", "size":5368709120, "uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484", "state":"disabled" } ] Cc: linux-mm@kvack.org Cc: "Kirill A. Shutemov" Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190905154603.10349-8-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 6 +--- drivers/nvdimm/pfn_devs.c | 75 ++++++++++++++++++++++++++++----------- include/linux/huge_mm.h | 7 +++- 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e89af4b2d8e9..ee5c04070ef9 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -289,11 +289,7 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) struct nd_pfn *to_nd_pfn(struct device *dev); #if IS_ENABLED(CONFIG_NVDIMM_PFN) -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE -#else -#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE -#endif +#define MAX_NVDIMM_ALIGN 4 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_pfn(struct device *dev); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index ce9ef18282dd..934cdcaaae97 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -103,39 +103,42 @@ static ssize_t align_show(struct device *dev, return sprintf(buf, "%ld\n", nd_pfn->align); } -static const unsigned long *nd_pfn_supported_alignments(void) +static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) { - /* - * This needs to be a non-static variable because the *_SIZE - * macros aren't always constants. - */ - const unsigned long supported_alignments[] = { - PAGE_SIZE, -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - HPAGE_PMD_SIZE, -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD - HPAGE_PUD_SIZE, -#endif -#endif - 0, - }; - static unsigned long data[ARRAY_SIZE(supported_alignments)]; - memcpy(data, supported_alignments, sizeof(data)); + alignments[0] = PAGE_SIZE; - return data; + if (has_transparent_hugepage()) { + alignments[1] = HPAGE_PMD_SIZE; + if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + alignments[2] = HPAGE_PUD_SIZE; + } + + return alignments; +} + +/* + * Use pmd mapping if supported as default alignment + */ +static unsigned long nd_pfn_default_alignment(void) +{ + + if (has_transparent_hugepage()) + return HPAGE_PMD_SIZE; + return PAGE_SIZE; } static ssize_t align_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; ssize_t rc; nd_device_lock(dev); nvdimm_bus_lock(dev); rc = nd_size_select_store(dev, buf, &nd_pfn->align, - nd_pfn_supported_alignments()); + nd_pfn_supported_alignments(aligns)); dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); @@ -259,7 +262,10 @@ static DEVICE_ATTR_RO(size); static ssize_t supported_alignments_show(struct device *dev, struct device_attribute *attr, char *buf) { - return nd_size_select_show(0, nd_pfn_supported_alignments(), buf); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + return nd_size_select_show(0, + nd_pfn_supported_alignments(aligns), buf); } static DEVICE_ATTR_RO(supported_alignments); @@ -302,7 +308,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, return NULL; nd_pfn->mode = PFN_MODE_NONE; - nd_pfn->align = PFN_DEFAULT_ALIGNMENT; + nd_pfn->align = nd_pfn_default_alignment(); dev = &nd_pfn->dev; device_initialize(&nd_pfn->dev); if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { @@ -412,6 +418,21 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) return 0; } +static bool nd_supported_alignment(unsigned long align) +{ + int i; + unsigned long supported[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + if (align == 0) + return false; + + nd_pfn_supported_alignments(supported); + for (i = 0; supported[i]; i++) + if (align == supported[i]) + return true; + return false; +} + /** * nd_pfn_validate - read and validate info-block * @nd_pfn: fsdax namespace runtime state / properties @@ -496,6 +517,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) return -EOPNOTSUPP; } + /* + * Check whether the we support the alignment. For Dax if the + * superblock alignment is not matching, we won't initialize + * the device. + */ + if (!nd_supported_alignment(align) && + !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) { + dev_err(&nd_pfn->dev, "init failed, alignment mismatch: " + "%ld:%ld\n", nd_pfn->align, align); + return -EOPNOTSUPP; + } + if (!nd_pfn->uuid) { /* * When probing a namepace via nd_pfn_probe() the uuid diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..376a81ff2c96 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -108,7 +108,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; - + /* + * For dax vmas, try to always use hugepage mappings. If the kernel does + * not support hugepages, fsdax mappings will fallback to PAGE_SIZE + * mappings, and device-dax namespaces, that try to guarantee a given + * mapping size, will fail to enable + */ if (vma_is_dax(vma)) return true; From 86aa66687442ef45909ff9814b82b4d2bb892294 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 9 Aug 2019 13:17:26 +0530 Subject: [PATCH 062/312] =?UTF-8?q?libnvdimm:=20Fix=20endian=20conversion?= =?UTF-8?q?=20issues=C2=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nd_label->dpa issue was observed when trying to enable the namespace created with little-endian kernel on a big-endian kernel. That made me run `sparse` on the rest of the code and other changes are the result of that. Fixes: d9b83c756953 ("libnvdimm, btt: rework error clearing") Fixes: 9dedc73a4658 ("libnvdimm/btt: Fix LBA masking during 'free list' population") Reviewed-by: Vishal Verma Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190809074726.27815-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/btt.c | 8 ++++---- drivers/nvdimm/namespace_devs.c | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index a8d56887ec88..3e9f45aec8d1 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -392,9 +392,9 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; if (++(arena->freelist[lane].seq) == 4) arena->freelist[lane].seq = 1; - if (ent_e_flag(ent->old_map)) + if (ent_e_flag(le32_to_cpu(ent->old_map))) arena->freelist[lane].has_err = 1; - arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map)); + arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map)); return ret; } @@ -560,8 +560,8 @@ static int btt_freelist_init(struct arena_info *arena) * FIXME: if error clearing fails during init, we want to make * the BTT read-only */ - if (ent_e_flag(log_new.old_map) && - !ent_normal(log_new.old_map)) { + if (ent_e_flag(le32_to_cpu(log_new.old_map)) && + !ent_normal(le32_to_cpu(log_new.old_map))) { arena->freelist[i].has_err = 1; ret = arena_clear_freelist_error(arena, i); if (ret) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 43401325c874..cca0a3ba1d2c 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1987,7 +1987,7 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region, nd_mapping = &nd_region->mapping[i]; label_ent = list_first_entry_or_null(&nd_mapping->labels, typeof(*label_ent), list); - label0 = label_ent ? label_ent->label : 0; + label0 = label_ent ? label_ent->label : NULL; if (!label0) { WARN_ON(1); @@ -2322,8 +2322,9 @@ static struct device **scan_labels(struct nd_region *nd_region) continue; /* skip labels that describe extents outside of the region */ - if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end) - continue; + if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start || + __le64_to_cpu(nd_label->dpa) > map_end) + continue; i = add_namespace_resource(nd_region, nd_label, devs, count); if (i < 0) From cf387d9644d8c78721cf9b77af9f67bb5b04da16 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Sep 2019 11:58:25 +0530 Subject: [PATCH 063/312] libnvdimm/altmap: Track namespace boundaries in altmap With PFN_MODE_PMEM namespace, the memmap area is allocated from the device area. Some architectures map the memmap area with large page size. On architectures like ppc64, 16MB page for memap mapping can map 262144 pfns. This maps a namespace size of 16G. When populating memmap region with 16MB page from the device area, make sure the allocated space is not used to map resources outside this namespace. Such usage of device area will prevent a namespace destroy. Add resource end pnf in altmap and use that to check if the memmap area allocation can map pfn outside the namespace. On ppc64 in such case we fallback to allocation from memory. This fix kernel crash reported below: [ 132.034989] WARNING: CPU: 13 PID: 13719 at mm/memremap.c:133 devm_memremap_pages_release+0x2d8/0x2e0 [ 133.464754] BUG: Unable to handle kernel data access at 0xc00c00010b204000 [ 133.464760] Faulting instruction address: 0xc00000000007580c [ 133.464766] Oops: Kernel access of bad area, sig: 11 [#1] [ 133.464771] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ..... [ 133.464901] NIP [c00000000007580c] vmemmap_free+0x2ac/0x3d0 [ 133.464906] LR [c0000000000757f8] vmemmap_free+0x298/0x3d0 [ 133.464910] Call Trace: [ 133.464914] [c000007cbfd0f7b0] [c0000000000757f8] vmemmap_free+0x298/0x3d0 (unreliable) [ 133.464921] [c000007cbfd0f8d0] [c000000000370a44] section_deactivate+0x1a4/0x240 [ 133.464928] [c000007cbfd0f980] [c000000000386270] __remove_pages+0x3a0/0x590 [ 133.464935] [c000007cbfd0fa50] [c000000000074158] arch_remove_memory+0x88/0x160 [ 133.464942] [c000007cbfd0fae0] [c0000000003be8c0] devm_memremap_pages_release+0x150/0x2e0 [ 133.464949] [c000007cbfd0fb70] [c000000000738ea0] devm_action_release+0x30/0x50 [ 133.464955] [c000007cbfd0fb90] [c00000000073a5a4] release_nodes+0x344/0x400 [ 133.464961] [c000007cbfd0fc40] [c00000000073378c] device_release_driver_internal+0x15c/0x250 [ 133.464968] [c000007cbfd0fc80] [c00000000072fd14] unbind_store+0x104/0x110 [ 133.464973] [c000007cbfd0fcd0] [c00000000072ee24] drv_attr_store+0x44/0x70 [ 133.464981] [c000007cbfd0fcf0] [c0000000004a32bc] sysfs_kf_write+0x6c/0xa0 [ 133.464987] [c000007cbfd0fd10] [c0000000004a1dfc] kernfs_fop_write+0x17c/0x250 [ 133.464993] [c000007cbfd0fd60] [c0000000003c348c] __vfs_write+0x3c/0x70 [ 133.464999] [c000007cbfd0fd80] [c0000000003c75d0] vfs_write+0xd0/0x250 djbw: Aneesh notes that this crash can likely be triggered in any kernel that supports 'papr_scm', so flagging that commit for -stable consideration. Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Reported-by: Sachin Sant Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pankaj Gupta Tested-by: Santosh Sivaraj Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20190910062826.10041-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/mm/init_64.c | 17 ++++++++++++++++- drivers/nvdimm/pfn_devs.c | 2 ++ include/linux/memremap.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a44f6281ca3a..4e08246acd79 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -172,6 +172,21 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) +{ + unsigned long nr_pfn = page_size / sizeof(struct page); + unsigned long start_pfn = page_to_pfn((struct page *)start); + + if ((start_pfn + nr_pfn) > altmap->end_pfn) + return true; + + if (start_pfn < altmap->base_pfn) + return true; + + return false; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { @@ -194,7 +209,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * fail due to alignment issues when using 16MB hugepages, so * fall back to system memory if the altmap allocation fail. */ - if (altmap) { + if (altmap && !altmap_cross_boundary(altmap, start, page_size)) { p = altmap_alloc_block_buf(page_size, altmap); if (!p) pr_debug("altmap block allocation failed, falling back to system memory"); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 934cdcaaae97..80c7992bc538 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -672,9 +672,11 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) struct nd_namespace_common *ndns = nd_pfn->ndns; struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); resource_size_t base = nsio->res.start + start_pad; + resource_size_t end = nsio->res.end - end_trunc; struct vmem_altmap __altmap = { .base_pfn = init_altmap_base(base), .reserve = init_altmap_reserve(base), + .end_pfn = PHYS_PFN(end), }; memcpy(res, &nsio->res, sizeof(*res)); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..c70996fe48c8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,6 +17,7 @@ struct device; */ struct vmem_altmap { const unsigned long base_pfn; + const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; From 59f08896f058a92f03a0041b397a1a227c5e8529 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Sep 2019 21:21:49 -0700 Subject: [PATCH 064/312] libnvdimm/nfit_test: Fix acpi_handle redefinition After commit 62974fc389b3 ("libnvdimm: Enable unit test infrastructure compile checks"), clang warns: In file included from ../drivers/nvdimm/../../tools/testing/nvdimm/test/iomap.c:15: ../drivers/nvdimm/../../tools/testing/nvdimm/test/nfit_test.h:206:15: warning: redefinition of typedef 'acpi_handle' is a C11 feature [-Wtypedef-redefinition] typedef void *acpi_handle; ^ ../include/acpi/actypes.h:424:15: note: previous definition is here typedef void *acpi_handle; /* Actually a ptr to a NS Node */ ^ 1 warning generated. The include chain: iomap.c -> linux/acpi.h -> acpi/acpi.h -> acpi/actypes.h nfit_test.h Avoid this by including linux/acpi.h in nfit_test.h, which allows us to remove both the typedef and the forward declaration of acpi_object. Link: https://github.com/ClangBuiltLinux/linux/issues/660 Signed-off-by: Nathan Chancellor Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20190918042148.77553-1-natechancellor@gmail.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit_test.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 448d686da8b1..0bf5640f1f07 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -4,6 +4,7 @@ */ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ +#include #include #include #include @@ -202,9 +203,6 @@ struct nd_intel_lss { __u32 status; } __packed; -union acpi_object; -typedef void *acpi_handle; - typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t); typedef union acpi_object *(*nfit_test_evaluate_dsm_fn)(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, From c42adf87e4e7ed77f6ffe288dc90f980d07d68df Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 19 Sep 2019 14:03:55 +0530 Subject: [PATCH 065/312] libnvdimm/region: Initialize bad block for volatile namespaces We do check for a bad block during namespace init and that use region bad block list. We need to initialize the bad block for volatile regions for this to work. We also observe a lockdep warning as below because the lock is not initialized correctly since we skip bad block init for volatile regions. INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc1-15699-g3dee241c937e #149 Call Trace: [c0000000f95cb250] [c00000000147dd84] dump_stack+0xe8/0x164 (unreliable) [c0000000f95cb2a0] [c00000000022ccd8] register_lock_class+0x308/0xa60 [c0000000f95cb3a0] [c000000000229cc0] __lock_acquire+0x170/0x1ff0 [c0000000f95cb4c0] [c00000000022c740] lock_acquire+0x220/0x270 [c0000000f95cb580] [c000000000a93230] badblocks_check+0xc0/0x290 [c0000000f95cb5f0] [c000000000d97540] nd_pfn_validate+0x5c0/0x7f0 [c0000000f95cb6d0] [c000000000d98300] nd_dax_probe+0xd0/0x1f0 [c0000000f95cb760] [c000000000d9b66c] nd_pmem_probe+0x10c/0x160 [c0000000f95cb790] [c000000000d7f5ec] nvdimm_bus_probe+0x10c/0x240 [c0000000f95cb820] [c000000000d0f844] really_probe+0x254/0x4e0 [c0000000f95cb8b0] [c000000000d0fdfc] driver_probe_device+0x16c/0x1e0 [c0000000f95cb930] [c000000000d10238] device_driver_attach+0x68/0xa0 [c0000000f95cb970] [c000000000d1040c] __driver_attach+0x19c/0x1c0 [c0000000f95cb9f0] [c000000000d0c4c4] bus_for_each_dev+0x94/0x130 [c0000000f95cba50] [c000000000d0f014] driver_attach+0x34/0x50 [c0000000f95cba70] [c000000000d0e208] bus_add_driver+0x178/0x2f0 [c0000000f95cbb00] [c000000000d117c8] driver_register+0x108/0x170 [c0000000f95cbb70] [c000000000d7edb0] __nd_driver_register+0xe0/0x100 [c0000000f95cbbd0] [c000000001a6baa4] nd_pmem_driver_init+0x34/0x48 [c0000000f95cbbf0] [c0000000000106f4] do_one_initcall+0x1d4/0x4b0 [c0000000f95cbcd0] [c0000000019f499c] kernel_init_freeable+0x544/0x65c [c0000000f95cbdb0] [c000000000010d6c] kernel_init+0x2c/0x180 [c0000000f95cbe20] [c00000000000b954] ret_from_kernel_thread+0x5c/0x68 Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20190919083355.26340-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 2 +- drivers/nvdimm/region.c | 4 ++-- drivers/nvdimm/region_devs.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 75a58a6e9615..d47412dcdf38 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -180,7 +180,7 @@ static int nvdimm_clear_badblocks_region(struct device *dev, void *data) sector_t sector; /* make sure device is a region */ - if (!is_nd_pmem(dev)) + if (!is_memory(dev)) return 0; nd_region = to_nd_region(dev); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 37bf8719a2a4..0f6978e72e7c 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -34,7 +34,7 @@ static int nd_region_probe(struct device *dev) if (rc) return rc; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { struct resource ndr_res; if (devm_init_badblocks(dev, &nd_region->bb)) @@ -123,7 +123,7 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) struct nd_region *nd_region = to_nd_region(dev); struct resource res; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { res.start = nd_region->ndr_start; res.end = nd_region->ndr_start + nd_region->ndr_size - 1; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3fd6b59abd33..ab91890f2486 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -632,11 +632,11 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) if (!is_memory(dev) && a == &dev_attr_dax_seed.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) + if (!is_memory(dev) && a == &dev_attr_badblocks.attr) return 0; if (a == &dev_attr_resource.attr) { - if (is_nd_pmem(dev)) + if (is_memory(dev)) return 0400; else return 0; From 674f31a352da5e9f621f757b9a89262f486533a0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 24 Sep 2019 10:34:49 -0700 Subject: [PATCH 066/312] libnvdimm: prevent nvdimm from requesting key when security is disabled Current implementation attempts to request keys from the keyring even when security is not enabled. Change behavior so when security is disabled it will skip key request. Error messages seen when no keys are installed and libnvdimm is loaded: request-key[4598]: Cannot find command to construct key 661489677 request-key[4606]: Cannot find command to construct key 34713726 Cc: stable@vger.kernel.org Fixes: 4c6926a23b76 ("acpi/nfit, libnvdimm: Add unlock of nvdimm support for Intel DIMMs") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/156934642272.30222.5230162488753445916.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dan Williams --- drivers/nvdimm/security.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index 9e45b207ff01..89b85970912d 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -177,6 +177,10 @@ static int __nvdimm_security_unlock(struct nvdimm *nvdimm) || !nvdimm->sec.flags) return -EIO; + /* No need to go further if security is disabled */ + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return 0; + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { dev_dbg(dev, "Security operation in progress.\n"); return -EBUSY; From 4c806b897d6075bfa5067e524fb058c57ab64e7b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Sep 2019 17:13:27 +0530 Subject: [PATCH 067/312] libnvdimm/region: Enable MAP_SYNC for volatile regions Some environments want to use a host tmpfs/ramdisk to back guest pmem. While the data is not persisted relative to the host it *is* persisted relative to guest crashes / reboots. The guest is free to use dax and MAP_SYNC to keep filesystem metadata consistent with dax accesses without requiring guest fsync(). The guest can also observe that the region is volatile and skip cache flushing as global visibility is enough to "persist" data relative to the host staying alive over guest reset events. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20190924114327.14700-1-aneesh.kumar@linux.ibm.com [djbw: reword the changelog] Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index ab91890f2486..ef423ba1a711 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1168,6 +1168,9 @@ EXPORT_SYMBOL_GPL(nvdimm_has_cache); bool is_nvdimm_sync(struct nd_region *nd_region) { + if (is_nd_volatile(&nd_region->dev)) + return true; + return is_nd_pmem(&nd_region->dev) && !test_bit(ND_REGION_ASYNC, &nd_region->flags); } From 131ea1ed3322c6ec06eb8f276f226c8a1f3bbf1b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Sep 2019 23:27:34 -0500 Subject: [PATCH 068/312] smb3: Add missing reparse tags Additional reparse tags were described for WSL and file sync. Add missing defines for these tags. Some will be useful for POSIX extensions (as discussed at Storage Developer Conference). Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/smbfsctl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 08628e6a42ac..1ff28529cf4b 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -144,6 +144,17 @@ #define IO_REPARSE_APPXSTREAM 0xC0000014 /* NFS symlinks, Win 8/SMB3 and later */ #define IO_REPARSE_TAG_NFS 0x80000014 +/* + * AzureFileSync - see + * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering + */ +#define IO_REPARSE_TAG_AZ_FILE_SYNC 0x8000001e +/* WSL reparse tags */ +#define IO_REPARSE_TAG_LX_SYMLINK 0xA000001D +#define IO_REPARSE_TAG_AF_UNIX 0x80000023 +#define IO_REPARSE_TAG_LX_FIFO 0x80000024 +#define IO_REPARSE_TAG_LX_CHR 0x80000025 +#define IO_REPARSE_TAG_LX_BLK 0x80000026 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ From cb063a83ca321fbf0cb2b4044186f241d89f3dc1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:18 +0200 Subject: [PATCH 069/312] thermal: db8500: Finalize device tree conversion At some point there was an attempt to convert the DB8500 thermal sensor to device tree: a probe path was added and the device tree was augmented for the Snowball board. The switchover was never completed: instead the thermal devices came from from the PRCMU MFD device and the probe on the Snowball was confused as another set of configuration appeared from the device tree. Move over to a device-tree only approach, as we fixed up the device trees. Cc: Vincent Guittot Acked-by: Lee Jones Reviewed-by: Daniel Lezcano Signed-off-by: Linus Walleij Signed-off-by: Eduardo Valentin --- drivers/mfd/db8500-prcmu.c | 53 +------------------- drivers/thermal/Kconfig | 2 +- drivers/thermal/db8500_thermal.c | 30 +++++------ include/linux/platform_data/db8500_thermal.h | 29 ----------- 4 files changed, 17 insertions(+), 97 deletions(-) delete mode 100644 include/linux/platform_data/db8500_thermal.h diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 90e0f21bc49c..518439c79826 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "dbx500-prcmu-regs.h" /* Index of different voltages to be used when accessing AVSData */ @@ -2984,53 +2983,6 @@ static struct ux500_wdt_data db8500_wdt_pdata = { .timeout = 600, /* 10 minutes */ .has_28_bits_resolution = true, }; -/* - * Thermal Sensor - */ - -static struct resource db8500_thsens_resources[] = { - { - .name = "IRQ_HOTMON_LOW", - .start = IRQ_PRCMU_HOTMON_LOW, - .end = IRQ_PRCMU_HOTMON_LOW, - .flags = IORESOURCE_IRQ, - }, - { - .name = "IRQ_HOTMON_HIGH", - .start = IRQ_PRCMU_HOTMON_HIGH, - .end = IRQ_PRCMU_HOTMON_HIGH, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct db8500_thsens_platform_data db8500_thsens_data = { - .trip_points[0] = { - .temp = 70000, - .type = THERMAL_TRIP_ACTIVE, - .cdev_name = { - [0] = "thermal-cpufreq-0", - }, - }, - .trip_points[1] = { - .temp = 75000, - .type = THERMAL_TRIP_ACTIVE, - .cdev_name = { - [0] = "thermal-cpufreq-0", - }, - }, - .trip_points[2] = { - .temp = 80000, - .type = THERMAL_TRIP_ACTIVE, - .cdev_name = { - [0] = "thermal-cpufreq-0", - }, - }, - .trip_points[3] = { - .temp = 85000, - .type = THERMAL_TRIP_CRITICAL, - }, - .num_trips = 4, -}; static const struct mfd_cell common_prcmu_devs[] = { { @@ -3054,10 +3006,7 @@ static const struct mfd_cell db8500_prcmu_devs[] = { }, { .name = "db8500-thermal", - .num_resources = ARRAY_SIZE(db8500_thsens_resources), - .resources = db8500_thsens_resources, - .platform_data = &db8500_thsens_data, - .pdata_size = sizeof(db8500_thsens_data), + .of_compatible = "stericsson,db8500-thermal", }, }; diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 9966364a6deb..001a21abcc28 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -310,7 +310,7 @@ config DOVE_THERMAL config DB8500_THERMAL tristate "DB8500 thermal management" - depends on MFD_DB8500_PRCMU + depends on MFD_DB8500_PRCMU && OF default y help Adds DB8500 thermal management implementation according to the thermal diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index b71a999d17d6..d650ae5fdf2a 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -13,13 +13,24 @@ #include #include #include -#include #include #include #include #define PRCMU_DEFAULT_MEASURE_TIME 0xFFF #define PRCMU_DEFAULT_LOW_TEMP 0 +#define COOLING_DEV_MAX 8 + +struct db8500_trip_point { + unsigned long temp; + enum thermal_trip_type type; + char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; +}; + +struct db8500_thsens_platform_data { + struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; + int num_trips; +}; struct db8500_thermal_zone { struct thermal_zone_device *therm_dev; @@ -301,7 +312,6 @@ static void db8500_thermal_work(struct work_struct *work) dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); } -#ifdef CONFIG_OF static struct db8500_thsens_platform_data* db8500_thermal_parse_dt(struct platform_device *pdev) { @@ -370,13 +380,6 @@ err_parse_dt: dev_err(&pdev->dev, "Parsing device tree data error.\n"); return NULL; } -#else -static inline struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct platform_device *pdev) -{ - return NULL; -} -#endif static int db8500_thermal_probe(struct platform_device *pdev) { @@ -386,11 +389,10 @@ static int db8500_thermal_probe(struct platform_device *pdev) int low_irq, high_irq, ret = 0; unsigned long dft_low, dft_high; - if (np) - ptrips = db8500_thermal_parse_dt(pdev); - else - ptrips = dev_get_platdata(&pdev->dev); + if (!np) + return -EINVAL; + ptrips = db8500_thermal_parse_dt(pdev); if (!ptrips) return -EINVAL; @@ -498,13 +500,11 @@ static int db8500_thermal_resume(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF static const struct of_device_id db8500_thermal_match[] = { { .compatible = "stericsson,db8500-thermal" }, {}, }; MODULE_DEVICE_TABLE(of, db8500_thermal_match); -#endif static struct platform_driver db8500_thermal_driver = { .driver = { diff --git a/include/linux/platform_data/db8500_thermal.h b/include/linux/platform_data/db8500_thermal.h deleted file mode 100644 index 55e55750a165..000000000000 --- a/include/linux/platform_data/db8500_thermal.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * db8500_thermal.h - DB8500 Thermal Management Implementation - * - * Copyright (C) 2012 ST-Ericsson - * Copyright (C) 2012 Linaro Ltd. - * - * Author: Hongbo Zhang - */ - -#ifndef _DB8500_THERMAL_H_ -#define _DB8500_THERMAL_H_ - -#include - -#define COOLING_DEV_MAX 8 - -struct db8500_trip_point { - unsigned long temp; - enum thermal_trip_type type; - char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; -}; - -struct db8500_thsens_platform_data { - struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; - int num_trips; -}; - -#endif /* _DB8500_THERMAL_H_ */ From 3de9e4dff889531d517c8808898972e96fa507b2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:19 +0200 Subject: [PATCH 070/312] thermal: db8500: Use dev helper variable The code gets easier to read like this. Cc: Vincent Guittot Reviewed-by: Daniel Lezcano Signed-off-by: Linus Walleij Signed-off-by: Eduardo Valentin --- drivers/thermal/db8500_thermal.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index d650ae5fdf2a..d250bcf3c10c 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -313,16 +313,16 @@ static void db8500_thermal_work(struct work_struct *work) } static struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct platform_device *pdev) + db8500_thermal_parse_dt(struct device *dev) { struct db8500_thsens_platform_data *ptrips; - struct device_node *np = pdev->dev.of_node; + struct device_node *np = dev->of_node; char prop_name[32]; const char *tmp_str; u32 tmp_data; int i, j; - ptrips = devm_kzalloc(&pdev->dev, sizeof(*ptrips), GFP_KERNEL); + ptrips = devm_kzalloc(dev, sizeof(*ptrips), GFP_KERNEL); if (!ptrips) return NULL; @@ -377,7 +377,7 @@ static struct db8500_thsens_platform_data* return ptrips; err_parse_dt: - dev_err(&pdev->dev, "Parsing device tree data error.\n"); + dev_err(dev, "Parsing device tree data error.\n"); return NULL; } @@ -385,18 +385,19 @@ static int db8500_thermal_probe(struct platform_device *pdev) { struct db8500_thermal_zone *pzone = NULL; struct db8500_thsens_platform_data *ptrips = NULL; - struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; int low_irq, high_irq, ret = 0; unsigned long dft_low, dft_high; if (!np) return -EINVAL; - ptrips = db8500_thermal_parse_dt(pdev); + ptrips = db8500_thermal_parse_dt(dev); if (!ptrips) return -EINVAL; - pzone = devm_kzalloc(&pdev->dev, sizeof(*pzone), GFP_KERNEL); + pzone = devm_kzalloc(dev, sizeof(*pzone), GFP_KERNEL); if (!pzone) return -ENOMEM; @@ -410,31 +411,31 @@ static int db8500_thermal_probe(struct platform_device *pdev) low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW"); if (low_irq < 0) { - dev_err(&pdev->dev, "Get IRQ_HOTMON_LOW failed.\n"); + dev_err(dev, "Get IRQ_HOTMON_LOW failed.\n"); ret = low_irq; goto out_unlock; } - ret = devm_request_threaded_irq(&pdev->dev, low_irq, NULL, + ret = devm_request_threaded_irq(dev, low_irq, NULL, prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, "dbx500_temp_low", pzone); if (ret < 0) { - dev_err(&pdev->dev, "Failed to allocate temp low irq.\n"); + dev_err(dev, "Failed to allocate temp low irq.\n"); goto out_unlock; } high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH"); if (high_irq < 0) { - dev_err(&pdev->dev, "Get IRQ_HOTMON_HIGH failed.\n"); + dev_err(dev, "Get IRQ_HOTMON_HIGH failed.\n"); ret = high_irq; goto out_unlock; } - ret = devm_request_threaded_irq(&pdev->dev, high_irq, NULL, + ret = devm_request_threaded_irq(dev, high_irq, NULL, prcmu_high_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, "dbx500_temp_high", pzone); if (ret < 0) { - dev_err(&pdev->dev, "Failed to allocate temp high irq.\n"); + dev_err(dev, "Failed to allocate temp high irq.\n"); goto out_unlock; } @@ -442,11 +443,11 @@ static int db8500_thermal_probe(struct platform_device *pdev) ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0); if (IS_ERR(pzone->therm_dev)) { - dev_err(&pdev->dev, "Register thermal zone device failed.\n"); + dev_err(dev, "Register thermal zone device failed.\n"); ret = PTR_ERR(pzone->therm_dev); goto out_unlock; } - dev_info(&pdev->dev, "Thermal zone device registered.\n"); + dev_info(dev, "Thermal zone device registered.\n"); dft_low = PRCMU_DEFAULT_LOW_TEMP; dft_high = ptrips->trip_points[0].temp; From 6c375eccded41df8033ed55a1b785531b304fc67 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 28 Aug 2019 15:03:20 +0200 Subject: [PATCH 071/312] thermal: db8500: Rewrite to be a pure OF sensor This patch rewrites the DB8500 thermal sensor to be a pure OF sensor, so that it can be used with thermal zones defined in the device tree. This driver was initially merged before we had generic thermal zone device tree bindings, and now it gets modernized to the way we do things these days. The old driver depended on a set of trigger points provided in the device tree or platform data to interpolate the current temperature between trigger points depending on whether the trend was rising or falling. This was bad because the trigger points should be used for defining temperature zone policies and bind to cooling devices. As the PRCMU (power reset control management unit) can only issue IRQs when we pass temperature trigger points upward or downward We instead define a number of temperature points inside the driver ranging from 15 to 100 degrees celsius. The effect is that when we register the device we quickly trigger 15, 20 ... up to the room temperature in succession and then we get continous event IRQs also under normal operating conditions, and the temperature of the system is now reported more accurately (+/- 2.5 degrees celsius) while in the past the first trigger point was at 70 degrees and the average temperature was simply reported as 35 degrees celsius (between 70 degrees and 0) until we passed 70 degrees which didn't accurately represent the temperature of the system. As a result of dropping all the trigger points from the driver and reusing the core DT thermal zone management code we reduce the code footprint quite a bit. Cc: Vincent Guittot Suggested-by: Daniel Lezcano Signed-off-by: Linus Walleij Reviewed-by: Daniel Lezcano Signed-off-by: Eduardo Valentin --- drivers/thermal/db8500_thermal.c | 477 +++++++------------------------ 1 file changed, 107 insertions(+), 370 deletions(-) diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index d250bcf3c10c..372dbbaaafb8 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -3,9 +3,9 @@ * db8500_thermal.c - DB8500 Thermal Management Implementation * * Copyright (C) 2012 ST-Ericsson - * Copyright (C) 2012 Linaro Ltd. + * Copyright (C) 2012-2019 Linaro Ltd. * - * Author: Hongbo Zhang + * Authors: Hongbo Zhang, Linus Walleij */ #include @@ -19,458 +19,202 @@ #define PRCMU_DEFAULT_MEASURE_TIME 0xFFF #define PRCMU_DEFAULT_LOW_TEMP 0 -#define COOLING_DEV_MAX 8 -struct db8500_trip_point { - unsigned long temp; - enum thermal_trip_type type; - char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH]; -}; - -struct db8500_thsens_platform_data { - struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS]; - int num_trips; +/** + * db8500_thermal_points - the interpolation points that trigger + * interrupts + */ +static const unsigned long db8500_thermal_points[] = { + 15000, + 20000, + 25000, + 30000, + 35000, + 40000, + 45000, + 50000, + 55000, + 60000, + 65000, + 70000, + 75000, + 80000, + /* + * This is where things start to get really bad for the + * SoC and the thermal zones should be set up to trigger + * critical temperature at 85000 mC so we don't get above + * this point. + */ + 85000, + 90000, + 95000, + 100000, }; struct db8500_thermal_zone { - struct thermal_zone_device *therm_dev; - struct mutex th_lock; - struct work_struct therm_work; - struct db8500_thsens_platform_data *trip_tab; - enum thermal_device_mode mode; + struct thermal_zone_device *tz; enum thermal_trend trend; - unsigned long cur_temp_pseudo; + unsigned long interpolated_temp; unsigned int cur_index; }; -/* Local function to check if thermal zone matches cooling devices */ -static int db8500_thermal_match_cdev(struct thermal_cooling_device *cdev, - struct db8500_trip_point *trip_point) -{ - int i; - - if (!strlen(cdev->type)) - return -EINVAL; - - for (i = 0; i < COOLING_DEV_MAX; i++) { - if (!strcmp(trip_point->cdev_name[i], cdev->type)) - return 0; - } - - return -ENODEV; -} - -/* Callback to bind cooling device to thermal zone */ -static int db8500_cdev_bind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned long max_state, upper, lower; - int i, ret = -EINVAL; - - cdev->ops->get_max_state(cdev, &max_state); - - for (i = 0; i < ptrips->num_trips; i++) { - if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i])) - continue; - - upper = lower = i > max_state ? max_state : i; - - ret = thermal_zone_bind_cooling_device(thermal, i, cdev, - upper, lower, THERMAL_WEIGHT_DEFAULT); - - dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type, - i, ret, ret ? "fail" : "succeed"); - } - - return ret; -} - -/* Callback to unbind cooling device from thermal zone */ -static int db8500_cdev_unbind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - int i, ret = -EINVAL; - - for (i = 0; i < ptrips->num_trips; i++) { - if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i])) - continue; - - ret = thermal_zone_unbind_cooling_device(thermal, i, cdev); - - dev_info(&cdev->device, "%s unbind from %d: %s\n", cdev->type, - i, ret ? "fail" : "succeed"); - } - - return ret; -} - /* Callback to get current temperature */ -static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp) +static int db8500_thermal_get_temp(void *data, int *temp) { - struct db8500_thermal_zone *pzone = thermal->devdata; + struct db8500_thermal_zone *th = data; /* * TODO: There is no PRCMU interface to get temperature data currently, * so a pseudo temperature is returned , it works for thermal framework * and this will be fixed when the PRCMU interface is available. */ - *temp = pzone->cur_temp_pseudo; + *temp = th->interpolated_temp; return 0; } /* Callback to get temperature changing trend */ -static int db8500_sys_get_trend(struct thermal_zone_device *thermal, - int trip, enum thermal_trend *trend) +static int db8500_thermal_get_trend(void *data, int trip, enum thermal_trend *trend) { - struct db8500_thermal_zone *pzone = thermal->devdata; + struct db8500_thermal_zone *th = data; - *trend = pzone->trend; + *trend = th->trend; return 0; } -/* Callback to get thermal zone mode */ -static int db8500_sys_get_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode *mode) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - - mutex_lock(&pzone->th_lock); - *mode = pzone->mode; - mutex_unlock(&pzone->th_lock); - - return 0; -} - -/* Callback to set thermal zone mode */ -static int db8500_sys_set_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode mode) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - - mutex_lock(&pzone->th_lock); - - pzone->mode = mode; - if (mode == THERMAL_DEVICE_ENABLED) - schedule_work(&pzone->therm_work); - - mutex_unlock(&pzone->th_lock); - - return 0; -} - -/* Callback to get trip point type */ -static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal, - int trip, enum thermal_trip_type *type) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - - if (trip >= ptrips->num_trips) - return -EINVAL; - - *type = ptrips->trip_points[trip].type; - - return 0; -} - -/* Callback to get trip point temperature */ -static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal, - int trip, int *temp) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - - if (trip >= ptrips->num_trips) - return -EINVAL; - - *temp = ptrips->trip_points[trip].temp; - - return 0; -} - -/* Callback to get critical trip point temperature */ -static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal, - int *temp) -{ - struct db8500_thermal_zone *pzone = thermal->devdata; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - int i; - - for (i = ptrips->num_trips - 1; i > 0; i--) { - if (ptrips->trip_points[i].type == THERMAL_TRIP_CRITICAL) { - *temp = ptrips->trip_points[i].temp; - return 0; - } - } - - return -EINVAL; -} - -static struct thermal_zone_device_ops thdev_ops = { - .bind = db8500_cdev_bind, - .unbind = db8500_cdev_unbind, - .get_temp = db8500_sys_get_temp, - .get_trend = db8500_sys_get_trend, - .get_mode = db8500_sys_get_mode, - .set_mode = db8500_sys_set_mode, - .get_trip_type = db8500_sys_get_trip_type, - .get_trip_temp = db8500_sys_get_trip_temp, - .get_crit_temp = db8500_sys_get_crit_temp, +static struct thermal_zone_of_device_ops thdev_ops = { + .get_temp = db8500_thermal_get_temp, + .get_trend = db8500_thermal_get_trend, }; -static void db8500_thermal_update_config(struct db8500_thermal_zone *pzone, - unsigned int idx, enum thermal_trend trend, - unsigned long next_low, unsigned long next_high) +static void db8500_thermal_update_config(struct db8500_thermal_zone *th, + unsigned int idx, + enum thermal_trend trend, + unsigned long next_low, + unsigned long next_high) { prcmu_stop_temp_sense(); - pzone->cur_index = idx; - pzone->cur_temp_pseudo = (next_low + next_high)/2; - pzone->trend = trend; + th->cur_index = idx; + th->interpolated_temp = (next_low + next_high)/2; + th->trend = trend; + /* + * The PRCMU accept absolute temperatures in celsius so divide + * down the millicelsius with 1000 + */ prcmu_config_hotmon((u8)(next_low/1000), (u8)(next_high/1000)); prcmu_start_temp_sense(PRCMU_DEFAULT_MEASURE_TIME); } static irqreturn_t prcmu_low_irq_handler(int irq, void *irq_data) { - struct db8500_thermal_zone *pzone = irq_data; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned int idx = pzone->cur_index; + struct db8500_thermal_zone *th = irq_data; + unsigned int idx = th->cur_index; unsigned long next_low, next_high; - if (unlikely(idx == 0)) + if (idx == 0) /* Meaningless for thermal management, ignoring it */ return IRQ_HANDLED; if (idx == 1) { - next_high = ptrips->trip_points[0].temp; + next_high = db8500_thermal_points[0]; next_low = PRCMU_DEFAULT_LOW_TEMP; } else { - next_high = ptrips->trip_points[idx-1].temp; - next_low = ptrips->trip_points[idx-2].temp; + next_high = db8500_thermal_points[idx - 1]; + next_low = db8500_thermal_points[idx - 2]; } idx -= 1; - db8500_thermal_update_config(pzone, idx, THERMAL_TREND_DROPPING, - next_low, next_high); - - dev_dbg(&pzone->therm_dev->device, + db8500_thermal_update_config(th, idx, THERMAL_TREND_DROPPING, + next_low, next_high); + dev_dbg(&th->tz->device, "PRCMU set max %ld, min %ld\n", next_high, next_low); - schedule_work(&pzone->therm_work); + thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } static irqreturn_t prcmu_high_irq_handler(int irq, void *irq_data) { - struct db8500_thermal_zone *pzone = irq_data; - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned int idx = pzone->cur_index; + struct db8500_thermal_zone *th = irq_data; + unsigned int idx = th->cur_index; unsigned long next_low, next_high; + int num_points = ARRAY_SIZE(db8500_thermal_points); - if (idx < ptrips->num_trips - 1) { - next_high = ptrips->trip_points[idx+1].temp; - next_low = ptrips->trip_points[idx].temp; + if (idx < num_points - 1) { + next_high = db8500_thermal_points[idx+1]; + next_low = db8500_thermal_points[idx]; idx += 1; - db8500_thermal_update_config(pzone, idx, THERMAL_TREND_RAISING, - next_low, next_high); + db8500_thermal_update_config(th, idx, THERMAL_TREND_RAISING, + next_low, next_high); - dev_dbg(&pzone->therm_dev->device, - "PRCMU set max %ld, min %ld\n", next_high, next_low); - } else if (idx == ptrips->num_trips - 1) - pzone->cur_temp_pseudo = ptrips->trip_points[idx].temp + 1; + dev_info(&th->tz->device, + "PRCMU set max %ld, min %ld\n", next_high, next_low); + } else if (idx == num_points - 1) + /* So we roof out 1 degree over the max point */ + th->interpolated_temp = db8500_thermal_points[idx] + 1; - schedule_work(&pzone->therm_work); + thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED); return IRQ_HANDLED; } -static void db8500_thermal_work(struct work_struct *work) -{ - enum thermal_device_mode cur_mode; - struct db8500_thermal_zone *pzone; - - pzone = container_of(work, struct db8500_thermal_zone, therm_work); - - mutex_lock(&pzone->th_lock); - cur_mode = pzone->mode; - mutex_unlock(&pzone->th_lock); - - if (cur_mode == THERMAL_DEVICE_DISABLED) - return; - - thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED); - dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n"); -} - -static struct db8500_thsens_platform_data* - db8500_thermal_parse_dt(struct device *dev) -{ - struct db8500_thsens_platform_data *ptrips; - struct device_node *np = dev->of_node; - char prop_name[32]; - const char *tmp_str; - u32 tmp_data; - int i, j; - - ptrips = devm_kzalloc(dev, sizeof(*ptrips), GFP_KERNEL); - if (!ptrips) - return NULL; - - if (of_property_read_u32(np, "num-trips", &tmp_data)) - goto err_parse_dt; - - if (tmp_data > THERMAL_MAX_TRIPS) - goto err_parse_dt; - - ptrips->num_trips = tmp_data; - - for (i = 0; i < ptrips->num_trips; i++) { - sprintf(prop_name, "trip%d-temp", i); - if (of_property_read_u32(np, prop_name, &tmp_data)) - goto err_parse_dt; - - ptrips->trip_points[i].temp = tmp_data; - - sprintf(prop_name, "trip%d-type", i); - if (of_property_read_string(np, prop_name, &tmp_str)) - goto err_parse_dt; - - if (!strcmp(tmp_str, "active")) - ptrips->trip_points[i].type = THERMAL_TRIP_ACTIVE; - else if (!strcmp(tmp_str, "passive")) - ptrips->trip_points[i].type = THERMAL_TRIP_PASSIVE; - else if (!strcmp(tmp_str, "hot")) - ptrips->trip_points[i].type = THERMAL_TRIP_HOT; - else if (!strcmp(tmp_str, "critical")) - ptrips->trip_points[i].type = THERMAL_TRIP_CRITICAL; - else - goto err_parse_dt; - - sprintf(prop_name, "trip%d-cdev-num", i); - if (of_property_read_u32(np, prop_name, &tmp_data)) - goto err_parse_dt; - - if (tmp_data > COOLING_DEV_MAX) - goto err_parse_dt; - - for (j = 0; j < tmp_data; j++) { - sprintf(prop_name, "trip%d-cdev-name%d", i, j); - if (of_property_read_string(np, prop_name, &tmp_str)) - goto err_parse_dt; - - if (strlen(tmp_str) >= THERMAL_NAME_LENGTH) - goto err_parse_dt; - - strcpy(ptrips->trip_points[i].cdev_name[j], tmp_str); - } - } - return ptrips; - -err_parse_dt: - dev_err(dev, "Parsing device tree data error.\n"); - return NULL; -} - static int db8500_thermal_probe(struct platform_device *pdev) { - struct db8500_thermal_zone *pzone = NULL; - struct db8500_thsens_platform_data *ptrips = NULL; + struct db8500_thermal_zone *th = NULL; struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; int low_irq, high_irq, ret = 0; - unsigned long dft_low, dft_high; - if (!np) - return -EINVAL; - - ptrips = db8500_thermal_parse_dt(dev); - if (!ptrips) - return -EINVAL; - - pzone = devm_kzalloc(dev, sizeof(*pzone), GFP_KERNEL); - if (!pzone) + th = devm_kzalloc(dev, sizeof(*th), GFP_KERNEL); + if (!th) return -ENOMEM; - mutex_init(&pzone->th_lock); - mutex_lock(&pzone->th_lock); - - pzone->mode = THERMAL_DEVICE_DISABLED; - pzone->trip_tab = ptrips; - - INIT_WORK(&pzone->therm_work, db8500_thermal_work); - low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW"); if (low_irq < 0) { - dev_err(dev, "Get IRQ_HOTMON_LOW failed.\n"); - ret = low_irq; - goto out_unlock; + dev_err(dev, "Get IRQ_HOTMON_LOW failed\n"); + return low_irq; } ret = devm_request_threaded_irq(dev, low_irq, NULL, prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, - "dbx500_temp_low", pzone); + "dbx500_temp_low", th); if (ret < 0) { - dev_err(dev, "Failed to allocate temp low irq.\n"); - goto out_unlock; + dev_err(dev, "failed to allocate temp low irq\n"); + return ret; } high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH"); if (high_irq < 0) { - dev_err(dev, "Get IRQ_HOTMON_HIGH failed.\n"); - ret = high_irq; - goto out_unlock; + dev_err(dev, "Get IRQ_HOTMON_HIGH failed\n"); + return high_irq; } ret = devm_request_threaded_irq(dev, high_irq, NULL, prcmu_high_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT, - "dbx500_temp_high", pzone); + "dbx500_temp_high", th); if (ret < 0) { - dev_err(dev, "Failed to allocate temp high irq.\n"); - goto out_unlock; + dev_err(dev, "failed to allocate temp high irq\n"); + return ret; } - pzone->therm_dev = thermal_zone_device_register("db8500_thermal_zone", - ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0); - - if (IS_ERR(pzone->therm_dev)) { - dev_err(dev, "Register thermal zone device failed.\n"); - ret = PTR_ERR(pzone->therm_dev); - goto out_unlock; + /* register of thermal sensor and get info from DT */ + th->tz = devm_thermal_zone_of_sensor_register(dev, 0, th, &thdev_ops); + if (IS_ERR(th->tz)) { + dev_err(dev, "register thermal zone sensor failed\n"); + return PTR_ERR(th->tz); } - dev_info(dev, "Thermal zone device registered.\n"); + dev_info(dev, "thermal zone sensor registered\n"); - dft_low = PRCMU_DEFAULT_LOW_TEMP; - dft_high = ptrips->trip_points[0].temp; + /* Start measuring at the lowest point */ + db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, + PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_points[0]); - db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE, - dft_low, dft_high); - - platform_set_drvdata(pdev, pzone); - pzone->mode = THERMAL_DEVICE_ENABLED; - -out_unlock: - mutex_unlock(&pzone->th_lock); - - return ret; -} - -static int db8500_thermal_remove(struct platform_device *pdev) -{ - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - - thermal_zone_device_unregister(pzone->therm_dev); - cancel_work_sync(&pzone->therm_work); - mutex_destroy(&pzone->th_lock); + platform_set_drvdata(pdev, th); return 0; } @@ -478,9 +222,6 @@ static int db8500_thermal_remove(struct platform_device *pdev) static int db8500_thermal_suspend(struct platform_device *pdev, pm_message_t state) { - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - - flush_work(&pzone->therm_work); prcmu_stop_temp_sense(); return 0; @@ -488,15 +229,12 @@ static int db8500_thermal_suspend(struct platform_device *pdev, static int db8500_thermal_resume(struct platform_device *pdev) { - struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev); - struct db8500_thsens_platform_data *ptrips = pzone->trip_tab; - unsigned long dft_low, dft_high; + struct db8500_thermal_zone *th = platform_get_drvdata(pdev); - dft_low = PRCMU_DEFAULT_LOW_TEMP; - dft_high = ptrips->trip_points[0].temp; - - db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE, - dft_low, dft_high); + /* Resume and start measuring at the lowest point */ + db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, + PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_points[0]); return 0; } @@ -515,7 +253,6 @@ static struct platform_driver db8500_thermal_driver = { .probe = db8500_thermal_probe, .suspend = db8500_thermal_suspend, .resume = db8500_thermal_resume, - .remove = db8500_thermal_remove, }; module_platform_driver(db8500_thermal_driver); From 21a045e430e56725628f361dede8a882e92469b9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 20 Sep 2019 21:45:33 +0200 Subject: [PATCH 072/312] ieee802154: mcr20a: simplify a bit 'mcr20a_handle_rx_read_buf_complete()' Use a 'skb_put_data()' variant instead of rewritting it. The __skb_put_data variant is safe here. It is obvious that the skb can not overflow. It has just been allocated a few lines above with the same 'len'. Signed-off-by: Christophe JAILLET Acked-by: Xue Liu Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mcr20a.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/mcr20a.c b/drivers/net/ieee802154/mcr20a.c index 17f2300e63ee..8dc04e2590b1 100644 --- a/drivers/net/ieee802154/mcr20a.c +++ b/drivers/net/ieee802154/mcr20a.c @@ -800,7 +800,7 @@ mcr20a_handle_rx_read_buf_complete(void *context) if (!skb) return; - memcpy(skb_put(skb, len), lp->rx_buf, len); + __skb_put_data(skb, lp->rx_buf, len); ieee802154_rx_irqsafe(lp->hw, skb, lp->rx_lqi[0]); print_hex_dump_debug("mcr20a rx: ", DUMP_PREFIX_OFFSET, 16, 1, From 1fac4a54374f7ef385938f3c6cf7649c0fe4f6cd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 23 Sep 2019 14:56:14 +0800 Subject: [PATCH 073/312] btrfs: relocation: fix use-after-free on dead relocation roots [BUG] One user reported a reproducible KASAN report about use-after-free: BTRFS info (device sdi1): balance: start -dvrange=1256811659264..1256811659265 BTRFS info (device sdi1): relocating block group 1256811659264 flags data|raid0 ================================================================== BUG: KASAN: use-after-free in btrfs_init_reloc_root+0x2cd/0x340 [btrfs] Write of size 8 at addr ffff88856f671710 by task kworker/u24:10/261579 CPU: 2 PID: 261579 Comm: kworker/u24:10 Tainted: P OE 5.2.11-arch1-1-kasan #4 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X99 Extreme4, BIOS P3.80 04/06/2018 Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs] Call Trace: dump_stack+0x7b/0xba print_address_description+0x6c/0x22e ? btrfs_init_reloc_root+0x2cd/0x340 [btrfs] __kasan_report.cold+0x1b/0x3b ? btrfs_init_reloc_root+0x2cd/0x340 [btrfs] kasan_report+0x12/0x17 __asan_report_store8_noabort+0x17/0x20 btrfs_init_reloc_root+0x2cd/0x340 [btrfs] record_root_in_trans+0x2a0/0x370 [btrfs] btrfs_record_root_in_trans+0xf4/0x140 [btrfs] start_transaction+0x1ab/0xe90 [btrfs] btrfs_join_transaction+0x1d/0x20 [btrfs] btrfs_finish_ordered_io+0x7bf/0x18a0 [btrfs] ? lock_repin_lock+0x400/0x400 ? __kmem_cache_shutdown.cold+0x140/0x1ad ? btrfs_unlink_subvol+0x9b0/0x9b0 [btrfs] finish_ordered_fn+0x15/0x20 [btrfs] normal_work_helper+0x1bd/0xca0 [btrfs] ? process_one_work+0x819/0x1720 ? kasan_check_read+0x11/0x20 btrfs_endio_write_helper+0x12/0x20 [btrfs] process_one_work+0x8c9/0x1720 ? pwq_dec_nr_in_flight+0x2f0/0x2f0 ? worker_thread+0x1d9/0x1030 worker_thread+0x98/0x1030 kthread+0x2bb/0x3b0 ? process_one_work+0x1720/0x1720 ? kthread_park+0x120/0x120 ret_from_fork+0x35/0x40 Allocated by task 369692: __kasan_kmalloc.part.0+0x44/0xc0 __kasan_kmalloc.constprop.0+0xba/0xc0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x138/0x260 btrfs_read_tree_root+0x92/0x360 [btrfs] btrfs_read_fs_root+0x10/0xb0 [btrfs] create_reloc_root+0x47d/0xa10 [btrfs] btrfs_init_reloc_root+0x1e2/0x340 [btrfs] record_root_in_trans+0x2a0/0x370 [btrfs] btrfs_record_root_in_trans+0xf4/0x140 [btrfs] start_transaction+0x1ab/0xe90 [btrfs] btrfs_start_transaction+0x1e/0x20 [btrfs] __btrfs_prealloc_file_range+0x1c2/0xa00 [btrfs] btrfs_prealloc_file_range+0x13/0x20 [btrfs] prealloc_file_extent_cluster+0x29f/0x570 [btrfs] relocate_file_extent_cluster+0x193/0xc30 [btrfs] relocate_data_extent+0x1f8/0x490 [btrfs] relocate_block_group+0x600/0x1060 [btrfs] btrfs_relocate_block_group+0x3a0/0xa00 [btrfs] btrfs_relocate_chunk+0x9e/0x180 [btrfs] btrfs_balance+0x14e4/0x2fc0 [btrfs] btrfs_ioctl_balance+0x47f/0x640 [btrfs] btrfs_ioctl+0x119d/0x8380 [btrfs] do_vfs_ioctl+0x9f5/0x1060 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x73/0xb0 do_syscall_64+0xa5/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 369692: __kasan_slab_free+0x14f/0x210 kasan_slab_free+0xe/0x10 kfree+0xd8/0x270 btrfs_drop_snapshot+0x154c/0x1eb0 [btrfs] clean_dirty_subvols+0x227/0x340 [btrfs] relocate_block_group+0x972/0x1060 [btrfs] btrfs_relocate_block_group+0x3a0/0xa00 [btrfs] btrfs_relocate_chunk+0x9e/0x180 [btrfs] btrfs_balance+0x14e4/0x2fc0 [btrfs] btrfs_ioctl_balance+0x47f/0x640 [btrfs] btrfs_ioctl+0x119d/0x8380 [btrfs] do_vfs_ioctl+0x9f5/0x1060 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x73/0xb0 do_syscall_64+0xa5/0x370 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88856f671100 which belongs to the cache kmalloc-4k of size 4096 The buggy address is located 1552 bytes inside of 4096-byte region [ffff88856f671100, ffff88856f672100) The buggy address belongs to the page: page:ffffea0015bd9c00 refcount:1 mapcount:0 mapping:ffff88864400e600 index:0x0 compound_mapcount: 0 flags: 0x2ffff0000010200(slab|head) raw: 02ffff0000010200 dead000000000100 dead000000000200 ffff88864400e600 raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88856f671600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88856f671680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88856f671700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88856f671780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88856f671800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== BTRFS info (device sdi1): 1 enospc errors during balance BTRFS info (device sdi1): balance: ended with status: -28 [CAUSE] The problem happens when finish_ordered_io() get called with balance still running, while the reloc root of that subvolume is already dead. (Tree is swap already done, but tree not yet deleted for possible qgroup usage.) That means root->reloc_root still exists, but that reloc_root can be under btrfs_drop_snapshot(), thus we shouldn't access it. The following race could cause the use-after-free problem: CPU1 | CPU2 -------------------------------------------------------------------------- | relocate_block_group() | |- unset_reloc_control(rc) | |- btrfs_commit_transaction() btrfs_finish_ordered_io() | |- clean_dirty_subvols() |- btrfs_join_transaction() | | |- record_root_in_trans() | | |- btrfs_init_reloc_root() | | |- if (root->reloc_root) | | | | |- root->reloc_root = NULL | | |- btrfs_drop_snapshot(reloc_root); |- reloc_root->last_trans| = trans->transid | ^^^^^^^^^^^^^^^^^^^^^^ Use after free [FIX] Fix it by the following modifications: - Test if the root has dead reloc tree before accessing root->reloc_root If the root has BTRFS_ROOT_DEAD_RELOC_TREE, then we don't need to create or update root->reloc_tree - Clear the BTRFS_ROOT_DEAD_RELOC_TREE flag until we have fully dropped reloc tree To co-operate with above modification, so as long as BTRFS_ROOT_DEAD_RELOC_TREE is still set, we won't try to re-create reloc tree at record_root_in_trans(). Reported-by: Cebtenzzre Fixes: d2311e698578 ("btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots") CC: stable@vger.kernel.org # 5.1+ Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2f0e25afa486..00504657b602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1435,6 +1435,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -2187,7 +2194,6 @@ static int clean_dirty_subvols(struct reloc_control *rc) /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; if (reloc_root) { @@ -2196,6 +2202,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { /* Orphan reloc tree, just clean it up */ From fab273595507a9ec7035df6d5512a955d80a80ba Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 25 Sep 2019 10:13:27 +0800 Subject: [PATCH 074/312] btrfs: Fix a regression which we can't convert to SINGLE profile [BUG] With v5.3 kernel, we can't convert to SINGLE profile: # btrfs balance start -f -dconvert=single $mnt ERROR: error during balancing '/mnt/btrfs': Invalid argument # dmesg -t | tail validate_convert_profile: data profile=0x1000000000000 allowed=0x20 is_valid=1 final=0x1000000000000 ret=1 BTRFS error (device dm-3): balance: invalid convert data profile single [CAUSE] With the extra debug output added, it shows that the @allowed bit is lacking the special in-memory only SINGLE profile bit. Thus we fail at that (profile & ~allowed) check. This regression is caused by commit 081db89b13cb ("btrfs: use raid_attr to get allowed profiles for balance conversion") and the fact that we don't use any bit to indicate SINGLE profile on-disk, but uses special in-memory only bit to help distinguish different profiles. [FIX] Add that BTRFS_AVAIL_ALLOC_BIT_SINGLE to @allowed, so the code should be the same as it was and fix the regression. Reported-by: Chris Murphy Fixes: 081db89b13cb ("btrfs: use raid_attr to get allowed profiles for balance conversion") CC: stable@vger.kernel.org # 5.3+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a324480bc88b..cdd7af424033 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4063,7 +4063,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); - allowed = 0; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; From ddef29578a81a1d4d8f2b26a7adbfe21407ee3ea Mon Sep 17 00:00:00 2001 From: "Wunderlich, Mark" Date: Wed, 18 Sep 2019 23:36:37 +0000 Subject: [PATCH 075/312] nvme-tcp: fix wrong stop condition in io_work Allow the do/while statement to continue if current time is not after the proposed time 'deadline'. Intent is to allow loop to proceed for a specific time period. Currently the loop, as coded, will exit after first pass. Signed-off-by: Mark Wunderlich Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4ffd5957637a..385a5212c10f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1042,7 +1042,7 @@ static void nvme_tcp_io_work(struct work_struct *w) { struct nvme_tcp_queue *queue = container_of(w, struct nvme_tcp_queue, io_work); - unsigned long start = jiffies + msecs_to_jiffies(1); + unsigned long deadline = jiffies + msecs_to_jiffies(1); do { bool pending = false; @@ -1067,7 +1067,7 @@ static void nvme_tcp_io_work(struct work_struct *w) if (!pending) return; - } while (time_after(jiffies, start)); /* quota is exhausted */ + } while (!time_after(jiffies, deadline)); /* quota is exhausted */ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } From 7cbb5c6f9aa7cfda7175d82a9cf77a92965b0c5e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 18 Sep 2019 13:15:55 -0500 Subject: [PATCH 076/312] nvme-pci: Save PCI state before putting drive into deepest state The action of saving the PCI state will cause numerous PCI configuration space reads which depending upon the vendor implementation may cause the drive to exit the deepest NVMe state. In these cases ASPM will typically resolve the PCIe link state and APST may resolve the NVMe power state. However it has also been observed that this register access after quiesced will cause PC10 failure on some device combinations. To resolve this, move the PCI state saving to before SetFeatures has been called. This has been proven to resolve the issue across a 5000 sample test on previously failing disk/system combinations. Signed-off-by: Mario Limonciello Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6b4d7b064b38..d3f63f0c212a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2944,11 +2944,21 @@ static int nvme_suspend(struct device *dev) if (ret < 0) goto unfreeze; + /* + * A saved state prevents pci pm from generically controlling the + * device's power. If we're using protocol specific settings, we don't + * want pci interfering. + */ + pci_save_state(pdev); + ret = nvme_set_power_state(ctrl, ctrl->npss); if (ret < 0) goto unfreeze; if (ret) { + /* discard the saved state */ + pci_load_saved_state(pdev, NULL); + /* * Clearing npss forces a controller reset on resume. The * correct value will be resdicovered then. @@ -2956,14 +2966,7 @@ static int nvme_suspend(struct device *dev) nvme_dev_disable(ndev, true); ctrl->npss = 0; ret = 0; - goto unfreeze; } - /* - * A saved state prevents pci pm from generically controlling the - * device's power. If we're using protocol specific settings, we don't - * want pci interfering. - */ - pci_save_state(pdev); unfreeze: nvme_unfreeze(ctrl); return ret; From bc4f6e06a90ea016855fc67212b4d500145f0b8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 23 Sep 2019 17:18:36 +0300 Subject: [PATCH 077/312] nvme: fix an error code in nvme_init_subsystem() "ret" should be a negative error code here, but it's either success or possibly uninitialized. Fixes: 32fd90c40768 ("nvme: change locking for the per-subsystem controller list") Signed-off-by: Dan Carpenter Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0c385b1994fe..d3c9df62a9d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2543,8 +2543,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) list_add_tail(&subsys->entry, &nvme_subsystems); } - if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, - dev_name(ctrl->device))) { + ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device)); + if (ret) { dev_err(ctrl->device, "failed to create sysfs link from subsystem.\n"); goto out_put_subsystem; From ff13c1b87c97275b82b2af99044b4abf6861b28f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sat, 21 Sep 2019 23:58:19 +0300 Subject: [PATCH 078/312] nvme-rdma: Fix max_hw_sectors calculation By default, the NVMe/RDMA driver should support max io_size of 1MiB (or upto the maximum supported size by the HCA). Currently, one will see that /sys/class/block//queue/max_hw_sectors_kb is 1020 instead of 1024. A non power of 2 value can cause performance degradation due to unnecessary splitting of IO requests and unoptimized allocation units. The number of pages per MR has been fixed here, so there is no longer any need to reduce max_sectors by 1. Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index dfa07bb9dfeb..9d16dfc29368 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -427,7 +427,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) { return min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ibdev->attrs.max_fast_reg_page_list_len); + ibdev->attrs.max_fast_reg_page_list_len - 1); } static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) @@ -437,7 +437,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); enum ib_poll_context poll_ctx; - int ret; + int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); if (!queue->device) { @@ -479,10 +479,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) goto out_destroy_qp; } + /* + * Currently we don't use SG_GAPS MR's so if the first entry is + * misaligned we'll end up using two entries for a single data page, + * so one additional entry is required. + */ + pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1; ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, queue->queue_size, IB_MR_TYPE_MEM_REG, - nvme_rdma_get_max_fr_pages(ibdev), 0); + pages_per_mr, 0); if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", @@ -820,8 +826,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_stop_queue; - ctrl->ctrl.max_hw_sectors = - (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); + ctrl->ctrl.max_segments = ctrl->max_fr_pages; + ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); From f03e42c6af60f778a6d1ccfb857db9b2ec835279 Mon Sep 17 00:00:00 2001 From: Gabriel Craciunescu Date: Mon, 23 Sep 2019 20:22:56 +0200 Subject: [PATCH 079/312] Added QUIRKs for ADATA XPG SX8200 Pro 512GB Booting with default_ps_max_latency_us >6000 makes the device fail. Also SUBNQN is NULL and gives a warning on each boot/resume. $ nvme id-ctrl /dev/nvme0 | grep ^subnqn subnqn : (null) I use this device with an Acer Nitro 5 (AN515-43-R8BF) Laptop. To be sure is not a Laptop issue only, I tested the device on my server board with the same results. ( with 2x,4x link on the board and 4x link on a PCI-E card ). Signed-off-by: Gabriel Craciunescu Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d3f63f0c212a..7ab07f4dce83 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3091,6 +3091,9 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, From 30f27d57c06e0199a9d3e0775113e13e2ae9078e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 13 Sep 2019 10:36:40 -0700 Subject: [PATCH 080/312] nvmet-tcp: remove superflous check on request sgl Now that sgl_free is null safe, drop the superflous check. Signed-off-by: Sagi Grimberg --- drivers/nvme/target/tcp.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index bf4f03474e89..d535080b781f 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -348,8 +348,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) return 0; err: - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); return NVME_SC_INTERNAL; } @@ -554,8 +553,7 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) if (queue->nvme_sq.sqhd_disabled) { kfree(cmd->iov); - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); } return 1; @@ -586,8 +584,7 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, return -EAGAIN; kfree(cmd->iov); - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); return 1; @@ -1310,8 +1307,7 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) nvmet_req_uninit(&cmd->req); nvmet_tcp_unmap_pdu_iovec(cmd); kfree(cmd->iov); - if (cmd->req.sg_cnt) - sgl_free(cmd->req.sg); + sgl_free(cmd->req.sg); } static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) From 19ea025e1d28c629b369c3532a85b3df478cc5c6 Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Tue, 24 Sep 2019 17:42:41 +0800 Subject: [PATCH 081/312] nvme: Add quirk for Kingston NVME SSD running FW E8FK11.T Kingston NVME SSD with firmware version E8FK11.T has no interrupt after resume with actions related to suspend to idle. This patch applied NVME_QUIRK_SIMPLE_SUSPEND quirk to fix this issue. Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend") Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=204887 Signed-off-by: Jian-Hong Pan Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d3c9df62a9d5..adff57ea149e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2292,6 +2292,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = { .vid = 0x14a4, .fr = "22301111", .quirks = NVME_QUIRK_SIMPLE_SUSPEND, + }, + { + /* + * This Kingston E8FK11.T firmware version has no interrupt + * after resume with actions related to suspend to idle + * https://bugzilla.kernel.org/show_bug.cgi?id=204887 + */ + .vid = 0x2646, + .fr = "E8FK11.T", + .quirks = NVME_QUIRK_SIMPLE_SUSPEND, } }; From 65e68edce0db433aa0c2b26d7dc14fbbbeb89fbb Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Tue, 24 Sep 2019 15:14:52 +0200 Subject: [PATCH 082/312] nvme: allow 64-bit results in passthru commands It is not possible to get 64-bit results from the passthru commands, what prevents from getting for the Capabilities (CAP) property value. As a result, it is not possible to implement IOL's NVMe Conformance test 4.3 Case 1 for Fabrics targets [1] (page 123). This issue has been already discussed [2], but without a solution. This patch solves the problem by adding new ioctls with a new passthru structure, including 64-bit results. The older ioctls stay unchanged. [1] https://www.iol.unh.edu/sites/default/files/testsuites/nvme/UNH-IOL_NVMe_Conformance_Test_Suite_v11.0.pdf [2] http://lists.infradead.org/pipermail/linux-nvme/2018-June/018791.html Signed-off-by: Marta Rybczynska Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 108 +++++++++++++++++++++++++++----- include/uapi/linux/nvme_ioctl.h | 23 +++++++ 2 files changed, 115 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index adff57ea149e..87ed437a46b8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -850,7 +850,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u32 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -891,7 +891,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, else ret = nvme_req(req)->status; if (result) - *result = le32_to_cpu(nvme_req(req)->result.u32); + *result = le64_to_cpu(nvme_req(req)->result.u64); if (meta && !ret && !write) { if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; @@ -1338,6 +1338,54 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_command c; unsigned timeout = 0; u32 effects; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + (void __user *)(uintptr_t)cmd.metadata, + cmd.metadata_len, 0, &result, timeout); + nvme_passthru_end(ctrl, effects); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1408,6 +1456,41 @@ static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, + struct nvme_ns_head *head, + int srcu_idx) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + ret = nvme_user_cmd(ctrl, NULL, argp); + break; + case NVME_IOCTL_ADMIN64_CMD: + ret = nvme_user_cmd64(ctrl, NULL, argp); + break; + default: + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + break; + } + nvme_put_ctrl(ctrl); + return ret; +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1425,20 +1508,8 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, * seperately and drop the ns SRCU reference early. This avoids a * deadlock when deleting namespaces using the passthrough interface. */ - if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { - struct nvme_ctrl *ctrl = ns->ctrl; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - - if (cmd == NVME_IOCTL_ADMIN_CMD) - ret = nvme_user_cmd(ctrl, NULL, argp); - else - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - - nvme_put_ctrl(ctrl); - return ret; - } + if (is_ctrl_ioctl(cmd)) + return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); switch (cmd) { case NVME_IOCTL_ID: @@ -1451,6 +1522,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, case NVME_IOCTL_SUBMIT_IO: ret = nvme_submit_io(ns, argp); break; + case NVME_IOCTL_IO64_CMD: + ret = nvme_user_cmd64(ns->ctrl, ns, argp); + break; default: if (ns->ndev) ret = nvme_nvm_ioctl(ns, cmd, arg); @@ -2852,6 +2926,8 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 1c215ea1798e..e168dc59e9a0 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -45,6 +45,27 @@ struct nvme_passthru_cmd { __u32 result; }; +struct nvme_passthru_cmd64 { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u64 result; +}; + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) @@ -54,5 +75,7 @@ struct nvme_passthru_cmd { #define NVME_IOCTL_RESET _IO('N', 0x44) #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) #define NVME_IOCTL_RESCAN _IO('N', 0x46) +#define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ From 2b1ff255d2d043aa4e6d62f50c478b283f9943ac Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 24 Sep 2019 14:22:08 -0700 Subject: [PATCH 083/312] nvme: Add ctrl attributes for queue_count and sqsize Current controller interrogation requires a lot of guesswork on how many io queues were created and what the io sq size is. The numbers are dependent upon core/fabric defaults, connect arguments, and target responses. Add sysfs attributes for queue_count and sqsize. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 87ed437a46b8..fd7dea36c3b6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3135,6 +3135,8 @@ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); +nvme_show_int_function(queue_count); +nvme_show_int_function(sqsize); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3215,6 +3217,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_address.attr, &dev_attr_state.attr, &dev_attr_numa_node.attr, + &dev_attr_queue_count.attr, + &dev_attr_sqsize.attr, NULL }; From c3ca78e2174413c136d62ebdf8039580fe72b504 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Sep 2019 00:32:13 -0500 Subject: [PATCH 084/312] smb3: pass mode bits into create calls We need to populate an ACL (security descriptor open context) on file and directory correct. This patch passes in the mode. Followon patch will build the open context and the security descriptor (from the mode) that goes in the open context. Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/cifsglob.h | 6 ++++-- fs/cifs/cifsproto.h | 3 ++- fs/cifs/cifssmb.c | 3 ++- fs/cifs/inode.c | 3 ++- fs/cifs/smb2inode.c | 34 +++++++++++++++++++--------------- fs/cifs/smb2pdu.c | 20 ++++++++++++++++++++ fs/cifs/smb2proto.h | 3 ++- 7 files changed, 51 insertions(+), 21 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 54e204589cb9..2e960e1049db 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -331,8 +331,9 @@ struct smb_version_operations { umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); - int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, - struct cifs_sb_info *); + int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *sb); /* set info on created directory */ void (*mkdir_setinfo)(struct inode *, const char *, struct cifs_sb_info *, struct cifs_tcon *, @@ -1209,6 +1210,7 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; +#define ACL_NO_MODE -1 struct cifs_open_parms { struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 99b1b1ef558c..e53e9f62b87b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -372,7 +372,8 @@ extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, +extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index dbee2132e419..4f554f019a98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1078,7 +1078,8 @@ RmDirRetry: } int -CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { int rc = 0; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 26cdfbf1e164..3bae2e53f0b8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1622,13 +1622,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) } /* BB add setting the equivalent of mode via CreateX w/ACLs */ - rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); + rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb); if (rc) { cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } + /* TODO: skip this for smb2/smb3 */ rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, xid); mkdir_out: diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index d2a3fb7e5c8d..4121ac1163ca 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -51,7 +51,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *ptr, int command, + __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile) { int rc; @@ -103,6 +103,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; oparms.fid = &fid; oparms.reconnect = false; + oparms.mode = mode; memset(&open_iov, 0, sizeof(open_iov)); rqst[num_rqst].rq_iov = open_iov; @@ -478,7 +479,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { *symlink = true; create_options |= OPEN_REPARSE_POINT; @@ -486,8 +487,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Failed on a symbolic link - query a reparse point info */ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, smb2_data, - SMB2_OP_QUERY_INFO, NULL); + create_options, ACL_NO_MODE, + smb2_data, SMB2_OP_QUERY_INFO, NULL); } if (rc) goto out; @@ -499,12 +500,14 @@ out: } int -smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL); + CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, + NULL); } void @@ -525,8 +528,8 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_get_writable_path(tcon, name, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO, - cfile); + CREATE_NOT_FILE, ACL_NO_MODE, + &data, SMB2_OP_SET_INFO, cfile); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -536,7 +539,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, + CREATE_NOT_FILE, ACL_NO_MODE, NULL, SMB2_OP_RMDIR, NULL); } @@ -546,7 +549,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); } static int @@ -564,7 +567,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command, cfile); + FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, + command, cfile); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -601,8 +605,8 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, __le64 eof = cpu_to_le64(size); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF, NULL); + FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, + &eof, SMB2_OP_SET_EOF, NULL); } int @@ -623,8 +627,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return PTR_ERR(tlink); rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO, NULL); + FILE_WRITE_ATTRIBUTES, FILE_OPEN, + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ea08e6159481..85f9d614d968 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -751,6 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); + if (mode == -1) + cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */ if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2417,6 +2419,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, /* File attributes ignored on open (used in create though) */ req->FileAttributes = cpu_to_le32(file_attributes); req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(oparms->disposition); req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); @@ -2518,6 +2521,23 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } + /* TODO: add handling for the mode on create */ + if (oparms->disposition == FILE_CREATE) + cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */ + + if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + /* rc = add_sd_context(iov, &n_iov, oparms->mode); */ + if (rc) + return rc; + } + if (n_iov > 2) { struct create_context *ccontext = (struct create_context *)iov[n_iov-1].iov_base; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 67a91b11fd59..da3a6d580808 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -84,7 +84,8 @@ extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); -extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, +extern int smb2_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb, From a0f0037e908c656573d165aadbfae6c05fc4dd75 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 26 Sep 2019 08:54:03 +0800 Subject: [PATCH 085/312] KVM: LAPIC: Loosen filter for adaptive tuning of lapic_timer_advance_ns 5000 guest cycles delta is easy to encounter on desktop, per-vCPU lapic_timer_advance_ns always keeps at 1000ns initial value, let's loosen the filter a bit to let adaptive tuning make progress. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3a3a6854dcca..87b0fcc23ef8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,9 +66,10 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul static bool lapic_timer_advance_dynamic __read_mostly; -#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1504,8 +1505,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -2302,7 +2303,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; From a1a640b8c0cd8a2a7f84ab694f04bc64dc6988af Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Sep 2019 11:17:14 -0700 Subject: [PATCH 086/312] kvm: x86: Fix a spurious -E2BIG in __do_cpuid_func Don't return -E2BIG from __do_cpuid_func when processing function 0BH or 1FH and the last interesting subleaf occupies the last allocated entry in the result array. Cc: Paolo Bonzini Fixes: 831bf664e9c1fc ("KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid") Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 63316036f85a..6bf4a261b308 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -618,16 +618,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } From 3ca94192278ca8de169d78c085396c424be123b3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 18 Sep 2019 17:50:10 +0800 Subject: [PATCH 087/312] KVM: X86: Fix userspace set invalid CR4 Reported by syzkaller: WARNING: CPU: 0 PID: 6544 at /home/kernel/data/kvm/arch/x86/kvm//vmx/vmx.c:4689 handle_desc+0x37/0x40 [kvm_intel] CPU: 0 PID: 6544 Comm: a.out Tainted: G OE 5.3.0-rc4+ #4 RIP: 0010:handle_desc+0x37/0x40 [kvm_intel] Call Trace: vmx_handle_exit+0xbe/0x6b0 [kvm_intel] vcpu_enter_guest+0x4dc/0x18d0 [kvm] kvm_arch_vcpu_ioctl_run+0x407/0x660 [kvm] kvm_vcpu_ioctl+0x3ad/0x690 [kvm] do_vfs_ioctl+0xa2/0x690 ksys_ioctl+0x6d/0x80 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x74/0x720 entry_SYSCALL_64_after_hwframe+0x49/0xbe When CR4.UMIP is set, guest should have UMIP cpuid flag. Current kvm set_sregs function doesn't have such check when userspace inputs sregs values. SECONDARY_EXEC_DESC is enabled on writes to CR4.UMIP in vmx_set_cr4 though guest doesn't have UMIP cpuid flag. The testcast triggers handle_desc warning when executing ltr instruction since guest architectural CR4 doesn't set UMIP. This patch fixes it by adding valid CR4 and CPUID combination checking in __set_sregs. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=138efb99600000 Reported-by: syzbot+0f1819555fbdce992df9@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 58 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ed07d8d2caa..180c7e88577a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -885,34 +885,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESERVED_BITS) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) + return -EINVAL; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -8714,10 +8722,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8736,7 +8740,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) From 43561123ab3759eb6ff47693aec1a307af0aef83 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Sep 2019 17:04:17 -0700 Subject: [PATCH 088/312] kvm: x86: Improve emulation of CPUID leaves 0BH and 1FH For these CPUID leaves, the EDX output is not dependent on the ECX input (i.e. the SIGNIFCANT_INDEX flag doesn't apply to EDX). Furthermore, the low byte of the ECX output is always identical to the low byte of the ECX input. KVM does not produce the correct ECX and EDX outputs for any undefined subleaves beyond the first. Special-case these CPUID leaves in kvm_cpuid, so that the ECX and EDX outputs are properly generated for all undefined subleaves. Fixes: 0771671749b59a ("KVM: Enhance guest cpuid management") Fixes: a87f2d3a6eadab ("KVM: x86: Add Intel CPUID.1F cpuid emulation support") Signed-off-by: Jim Mattson Reviewed-by: Marc Orr Reviewed-by: Peter Shier Reviewed-by: Jacob Xu Cc: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 81 +++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6bf4a261b308..a26a1ae03145 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -973,53 +973,64 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; + struct kvm_cpuid_entry2 *max; - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) { - entry_found = false; - if (!check_limit) - goto out; - - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. + */ + if (!entry && check_limit && !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. + */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); From 5f41a37b151f6459e0b650a2f4d1d59b6c02d1ab Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Sep 2019 17:04:18 -0700 Subject: [PATCH 089/312] kvm: x86: Use AMD CPUID semantics for AMD vCPUs When the guest CPUID information represents an AMD vCPU, return all zeroes for queries of undefined CPUID leaves, whether or not they are in range. Signed-off-by: Jim Mattson Fixes: bd22f5cfcfe8f6 ("KVM: move and fix substitue search for missing CPUID entries") Reviewed-by: Marc Orr Reviewed-by: Peter Shier Reviewed-by: Jacob Xu Cc: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a26a1ae03145..ce4a1b3cc3e8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -998,9 +998,11 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, /* * Intel CPUID semantics treats any query for an out-of-range * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were - * requested. + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. */ - if (!entry && check_limit && !cpuid_function_in_range(vcpu, function)) { + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { max = kvm_find_cpuid_entry(vcpu, 0, 0); if (max) { function = max->eax; From 40bc47b08b6efc8b64fef77cb46974f634215425 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 24 Sep 2019 13:51:08 -0700 Subject: [PATCH 090/312] kvm: x86: Enumerate support for CLZERO instruction CLZERO is available to the guest if it is supported on the host. Therefore, enumerate support for the instruction in KVM_GET_SUPPORTED_CPUID whenever it is supported on the host. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ce4a1b3cc3e8..34d9f9f481a3 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -485,8 +485,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); + F(CLZERO) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | + F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | F(AMD_SSBD) | + F(VIRT_SSBD) | F(AMD_SSB_NO); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = From 504ce1954fba888936c9d13ccc1e3db9b8f613d5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 25 Sep 2019 23:37:21 +0200 Subject: [PATCH 091/312] KVM: x86: Expose XSAVEERPTR to the guest I was surprised to see that the guest reported `fxsave_leak' while the host did not. After digging deeper I noticed that the bits are simply masked out during enumeration. The XSAVEERPTR feature is actually a bug fix on AMD which means the kernel can disable a workaround. Pass XSAVEERPTR to the guest if available on the host. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 34d9f9f481a3..9c5029cf6f3f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -485,9 +485,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(CLZERO) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | - F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | F(AMD_SSBD) | - F(VIRT_SSBD) | F(AMD_SSB_NO); + F(CLZERO) | F(XSAVEERPTR) | + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = From f968688f44f529f96a64c9853fb2fb5d0a329aff Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Sep 2019 12:44:39 +0900 Subject: [PATCH 092/312] nvme: Move ctrl sqsize to generic space This isn't specific to fabrics. Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b5013c101b35..38a83ef5bcd3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -221,6 +221,7 @@ struct nvme_ctrl { u16 oacs; u16 nssa; u16 nr_streams; + u16 sqsize; u32 max_namespaces; atomic_t abort_limit; u8 vwc; @@ -269,7 +270,6 @@ struct nvme_ctrl { u16 hmmaxd; /* Fabrics only */ - u16 sqsize; u32 ioccsz; u32 iorcsz; u16 icdoff; From ff3ee62a55869b1a64266b5c15af16f2eb37c8a7 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 26 Sep 2019 04:37:18 -0500 Subject: [PATCH 093/312] smb3: missing ACL related flags Various SMB3 ACL related flags (for security descriptor and ACEs for example) were missing and some fields are different in SMB3 and CIFS. Update cifsacl.h definitions based on current MS-DTYP specification. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Aurelien Aptel --- fs/cifs/cifsacl.h | 81 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index eb428349f29a..439b99cefeb0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -90,14 +90,93 @@ struct cifs_acl { __le32 num_aces; } __attribute__((packed)); +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + struct cifs_ace { - __u8 type; + __u8 type; /* see above and MS-DTYP 2.4.4.1 */ __u8 flags; __le16 size; __le32 access_req; struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ } __attribute__((packed)); +/* + * The current SMB3 form of security descriptor is similar to what was used for + * cifs (see above) but some fields are split, and fields in the struct below + * matches names of fields to the the spec, MS-DTYP (see sections 2.4.5 and + * 2.4.6). Note that "CamelCase" fields are used in this struct in order to + * match the MS-DTYP and MS-SMB2 specs which define the wire format. + */ +struct smb3_sd { + __u8 Revision; /* revision level, MUST be one */ + __u8 Sbz1; /* only meaningful if 'RM' flag set below */ + __le16 Control; + __le32 OffsetOwner; + __le32 OffsetGroup; + __le32 OffsetSacl; + __le32 OffsetDacl; +} __packed; + +/* Meaning of 'Control' field flags */ +#define ACL_CONTROL_SR 0x0001 /* Self relative */ +#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */ +#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */ +#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */ +#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */ +#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */ +#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */ +#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */ +#define ACL_CONTROL_SS 0x0100 /* Create server ACL */ +#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */ +#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */ +#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */ +#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */ +#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */ +#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */ +#define ACL_CONTROL_OD 0x8000 /* User was defaulted */ + +/* Meaning of AclRevision flags */ +#define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */ +#define ACL_REVISION_DS 0x04 /* Additional AceTypes allowed */ + +struct smb3_acl { + u8 AclRevision; /* revision level */ + u8 Sbz1; /* MBZ */ + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; /* MBZ */ +} __packed; + + /* * Minimum security identifier can be one for system defined Users * and Groups such as NULL SID and World or Built-in accounts such From a016e2794fc3a245a91946038dd8f34d65e53cc3 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 26 Sep 2019 12:31:20 -0700 Subject: [PATCH 094/312] CIFS: Fix oplock handling for SMB 2.1+ protocols There may be situations when a server negotiates SMB 2.1 protocol version or higher but responds to a CREATE request with an oplock rather than a lease. Currently the client doesn't handle such a case correctly: when another CREATE comes in the server sends an oplock break to the initial CREATE and the client doesn't send an ack back due to a wrong caching level being set (READ instead of RWH). Missing an oplock break ack makes the server wait until the break times out which dramatically increases the latency of the second CREATE. Fix this by properly detecting oplocks when using SMB 2.1 protocol version and higher. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3010066a8709..4c0922596467 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3333,6 +3333,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; + /* Check if the server granted an oplock rather than a lease */ + if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return smb2_set_oplock_level(cinode, oplock, epoch, + purge_cache); + if (oplock & SMB2_LEASE_READ_CACHING_HE) { new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); From a0ecd6fdbf5d648123a7315c695fb6850d702835 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Tue, 24 Sep 2019 23:30:30 -0500 Subject: [PATCH 095/312] drm/komeda: prevent memory leak in komeda_wb_connector_add In komeda_wb_connector_add if drm_writeback_connector_init fails the allocated memory for kwb_conn should be released. Signed-off-by: Navid Emamdoost Reviewed-by: James Qian Wang (Arm Technology China) Signed-off-by: james qian wang (Arm Technology China) Link: https://patchwork.freedesktop.org/patch/msgid/20190925043031.32308-1-navid.emamdoost@gmail.com --- drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c index 23fbee268119..b72840c06ab7 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c @@ -165,8 +165,10 @@ static int komeda_wb_connector_add(struct komeda_kms_dev *kms, &komeda_wb_encoder_helper_funcs, formats, n_formats); komeda_put_fourcc_list(formats); - if (err) + if (err) { + kfree(kwb_conn); return err; + } drm_connector_helper_add(&wb_conn->base, &komeda_wb_conn_helper_funcs); From 6eeb4ef049e7cd89783e8ebe1ea2f1dac276f82c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 24 Sep 2019 12:43:08 +0200 Subject: [PATCH 096/312] KVM: x86: assign two bits to track SPTE kinds Currently, we are overloading SPTE_SPECIAL_MASK to mean both "A/D bits unavailable" and MMIO, where the difference between the two is determined by mio_mask and mmio_value. However, the next patch will need two bits to distinguish availability of A/D bits from write protection. So, while at it give MMIO its own bit pattern, and move the two bits from bit 62 to bits 52..53 since Intel is allocating EPT page table bits from the top. Reviewed-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ------- arch/x86/kvm/mmu.c | 28 ++++++++++++++++++---------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 23edf56cf577..50eb430b0ad8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -219,13 +219,6 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting - * with the SVE bit in EPT PTEs. - */ -#define SPTE_SPECIAL_MASK (1ULL << 62) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5269aa057dfa..bac8d228d82b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -83,7 +83,16 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -219,12 +228,11 @@ static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -304,7 +312,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } @@ -323,7 +331,7 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return !(spte & shadow_acc_track_value); + return (spte & SPTE_SPECIAL_MASK) == SPTE_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -461,7 +469,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -2622,7 +2630,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2968,7 +2976,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware From 1f4e5fc83a4217fc61b23370b07573827329d7bd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Sep 2019 18:47:59 +0200 Subject: [PATCH 097/312] KVM: x86: fix nested guest live migration with PML Shadow paging is fundamentally incompatible with the page-modification log, because the GPAs in the log come from the wrong memory map. In particular, for the EPT page-modification log, the GPAs in the log come from L2 rather than L1. (If there was a non-EPT page-modification log, we couldn't use it for shadow paging because it would log GVAs rather than GPAs). Therefore, we need to rely on write protection to record dirty pages. This has the side effect of bypassing PML, since writes now result in an EPT violation vmexit. This is relatively easy to add to KVM, because pretty much the only place that needs changing is spte_clear_dirty. The first access to the page already goes through the page fault path and records the correct GPA; it's only subsequent accesses that are wrong. Therefore, we can equip set_spte (where the first access happens) to record that the SPTE will have to be write protected, and then spte_clear_dirty will use this information to do the right thing. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bac8d228d82b..24c23c66b226 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -92,6 +92,7 @@ module_param(dbg, bool, 0644); #define SPTE_SPECIAL_MASK (3ULL << 52) #define SPTE_AD_ENABLED_MASK (0ULL << 52) #define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) #define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -328,10 +329,27 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) == SPTE_AD_ENABLED_MASK; + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -1597,16 +1615,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1625,10 +1643,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1639,6 +1657,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2977,6 +3000,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware From 094444204570a5420d9e6ce3d4558877c3487856 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Sep 2019 15:01:15 +0200 Subject: [PATCH 098/312] selftests: kvm: add test for dirty logging inside nested guests Check that accesses by nested guests are logged according to the L1 physical addresses rather than L2. Most of the patch is really adding EPT support to the testing framework. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/x86_64/processor.h | 3 + .../selftests/kvm/include/x86_64/vmx.h | 14 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- .../selftests/kvm/lib/kvm_util_internal.h | 3 + tools/testing/selftests/kvm/lib/x86_64/vmx.c | 201 +++++++++++++++++- .../selftests/kvm/x86_64/vmx_dirty_log_test.c | 156 ++++++++++++++ 7 files changed, 377 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 62c591f87dab..fd84b7a78dcf 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -22,6 +22,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 0c17f2ee685e..ff234018219c 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -1083,6 +1083,9 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); #define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU +/* VMX_EPT_VPID_CAP bits */ +#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) + /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 69b17055f63d..6ae5a47fe067 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -569,6 +569,10 @@ struct vmx_pages { void *enlightened_vmcs_hva; uint64_t enlightened_vmcs_gpa; void *enlightened_vmcs; + + void *eptp_hva; + uint64_t eptp_gpa; + void *eptp; }; struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); @@ -576,4 +580,14 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); bool load_vmcs(struct vmx_pages *vmx); +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + uint32_t eptp_memslot); +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot, uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot); + #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 80a338b5403c..41cf45416060 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -705,7 +705,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, * on error (e.g. currently no memory region using memslot as a KVM * memory slot ID). */ -static struct userspace_mem_region * +struct userspace_mem_region * memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index f36262e0f655..ac50c42750cf 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -68,4 +68,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + #endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 9cef0455b819..fab8f6b0bf52 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -7,11 +7,39 @@ #include "test_util.h" #include "kvm_util.h" +#include "../kvm_util_internal.h" #include "processor.h" #include "vmx.h" +#define PAGE_SHIFT_4K 12 + +#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 + bool enable_evmcs; +struct eptPageTableEntry { + uint64_t readable:1; + uint64_t writable:1; + uint64_t executable:1; + uint64_t memory_type:3; + uint64_t ignore_pat:1; + uint64_t page_size:1; + uint64_t accessed:1; + uint64_t dirty:1; + uint64_t ignored_11_10:2; + uint64_t address:40; + uint64_t ignored_62_52:11; + uint64_t suppress_ve:1; +}; + +struct eptPageTablePointer { + uint64_t memory_type:3; + uint64_t page_walk_length:3; + uint64_t ad_enabled:1; + uint64_t reserved_11_07:5; + uint64_t address:40; + uint64_t reserved_63_52:12; +}; int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) { uint16_t evmcs_ver; @@ -174,15 +202,35 @@ bool load_vmcs(struct vmx_pages *vmx) */ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) { + uint32_t sec_exec_ctl = 0; + vmwrite(VIRTUAL_PROCESSOR_ID, 0); vmwrite(POSTED_INTR_NV, 0); vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); - if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0)) + + if (vmx->eptp_gpa) { + uint64_t ept_paddr; + struct eptPageTablePointer eptp = { + .memory_type = VMX_BASIC_MEM_TYPE_WB, + .page_walk_length = 3, /* + 1 */ + .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), + .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, + }; + + memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); + vmwrite(EPT_POINTER, ept_paddr); + sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; + } + + if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl)) vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); - else + else { vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); + GUEST_ASSERT(!sec_exec_ctl); + } + vmwrite(EXCEPTION_BITMAP, 0); vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ @@ -327,3 +375,152 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_host_state(); init_vmcs_guest_state(guest_rip, guest_rsp); } + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) +{ + uint16_t index[4]; + struct eptPageTableEntry *pml4e; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((nested_paddr % vm->page_size) == 0, + "Nested physical address not on page boundary,\n" + " nested_paddr: 0x%lx vm->page_size: 0x%x", + nested_paddr, vm->page_size); + TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + index[0] = (nested_paddr >> 12) & 0x1ffu; + index[1] = (nested_paddr >> 21) & 0x1ffu; + index[2] = (nested_paddr >> 30) & 0x1ffu; + index[3] = (nested_paddr >> 39) & 0x1ffu; + + /* Allocate page directory pointer table if not present. */ + pml4e = vmx->eptp_hva; + if (!pml4e[index[3]].readable) { + pml4e[index[3]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pml4e[index[3]].writable = true; + pml4e[index[3]].readable = true; + pml4e[index[3]].executable = true; + } + + /* Allocate page directory table if not present. */ + struct eptPageTableEntry *pdpe; + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); + if (!pdpe[index[2]].readable) { + pdpe[index[2]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pdpe[index[2]].writable = true; + pdpe[index[2]].readable = true; + pdpe[index[2]].executable = true; + } + + /* Allocate page table if not present. */ + struct eptPageTableEntry *pde; + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); + if (!pde[index[1]].readable) { + pde[index[1]].address = vm_phy_page_alloc(vm, + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) + >> vm->page_shift; + pde[index[1]].writable = true; + pde[index[1]].readable = true; + pde[index[1]].executable = true; + } + + /* Fill in page table entry. */ + struct eptPageTableEntry *pte; + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); + pte[index[0]].address = paddr >> vm->page_shift; + pte[index[0]].writable = true; + pte[index[0]].readable = true; + pte[index[0]].executable = true; + + /* + * For now mark these as accessed and dirty because the only + * testcase we have needs that. Can be reconsidered later. + */ + pte[index[0]].accessed = true; + pte[index[0]].dirty = true; +} + +/* + * Map a range of EPT guest physical addresses to the VM's physical address + * + * Input Args: + * vm - Virtual Machine + * nested_paddr - Nested guest physical address to map + * paddr - VM Physical Address + * size - The size of the range to map + * eptp_memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a nested guest translation for the + * page range starting at nested_paddr to the page range starting at paddr. + */ +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + uint32_t eptp_memslot) +{ + size_t page_size = vm->page_size; + size_t npages = size / page_size; + + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); + + while (npages--) { + nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + nested_paddr += page_size; + paddr += page_size; + } +} + +/* Prepare an identity extended page table that maps all the + * physical pages in VM. + */ +void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t memslot, uint32_t eptp_memslot) +{ + sparsebit_idx_t i, last; + struct userspace_mem_region *region = + memslot2region(vm, memslot); + + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; + last = i + (region->region.memory_size >> vm->page_shift); + for (;;) { + i = sparsebit_next_clear(region->unused_phy_pages, i); + if (i > last) + break; + + nested_map(vmx, vm, + (uint64_t)i << vm->page_shift, + (uint64_t)i << vm->page_shift, + 1 << vm->page_shift, + eptp_memslot); + } +} + +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, + uint32_t eptp_memslot) +{ + vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); + vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c new file mode 100644 index 000000000000..0bca1cfe2c1e --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging test + * + * Copyright (C) 2018, Red Hat, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 1 + +/* The memory slot index to track dirty pages */ +#define TEST_MEM_SLOT_INDEX 1 +#define TEST_MEM_SIZE 3 + +/* L1 guest test virtual memory offset */ +#define GUEST_TEST_MEM 0xc0000000 + +/* L2 guest test virtual memory offset */ +#define NESTED_TEST_MEM1 0xc0001000 +#define NESTED_TEST_MEM2 0xc0002000 + +static void l2_guest_code(void) +{ + *(volatile uint64_t *)NESTED_TEST_MEM1; + *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + GUEST_SYNC(true); + GUEST_SYNC(false); + + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + GUEST_SYNC(true); + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + GUEST_SYNC(true); + GUEST_SYNC(false); + + /* Exit to L1 and never come back. */ + vmcall(); +} + +void l1_guest_code(struct vmx_pages *vmx) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(false); + GUEST_ASSERT(!vmlaunch()); + GUEST_SYNC(false); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct vmx_pages *vmx; + unsigned long *bmap; + uint64_t *host_test_mem; + + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + bool done = false; + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + run = vcpu_state(vm, VCPU_ID); + + /* Add an extra memory slot for testing dirty logging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + GUEST_TEST_MEM, + TEST_MEM_SLOT_INDEX, + TEST_MEM_SIZE, + KVM_MEM_LOG_DIRTY_PAGES); + + /* + * Add an identity map for GVA range [0xc0000000, 0xc0002000). This + * affects both L1 and L2. However... + */ + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, + TEST_MEM_SIZE * 4096, 0); + + /* + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to + * 0xc0000000. + * + * Note that prepare_eptp should be called only L1's GPA map is done, + * meaning after the last call to virt_map. + */ + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); + + bmap = bitmap_alloc(TEST_MEM_SIZE); + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); + + while (!done) { + memset(host_test_mem, 0xaa, TEST_MEM_SIZE * 4096); + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + /* + * The nested guest wrote at offset 0x1000 in the memslot, but the + * dirty bitmap must be filled in according to L1 GPA, not L2. + */ + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + if (uc.args[1]) { + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n"); + } else { + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n"); + } + + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n"); + TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n"); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + } + } +} From b9023b91dd020ad7e093baa5122b6968c48cc9e0 Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Thu, 26 Sep 2019 15:51:01 +0200 Subject: [PATCH 099/312] tick: broadcast-hrtimer: Fix a race in bc_set_next When a cpu requests broadcasting, before starting the tick broadcast hrtimer, bc_set_next() checks if the timer callback (bc_handler) is active using hrtimer_try_to_cancel(). But hrtimer_try_to_cancel() does not provide the required synchronization when the callback is active on other core. The callback could have already executed tick_handle_oneshot_broadcast() and could have also returned. But still there is a small time window where the hrtimer_try_to_cancel() returns -1. In that case bc_set_next() returns without doing anything, but the next_event of the tick broadcast clock device is already set to a timeout value. In the race condition diagram below, CPU #1 is running the timer callback and CPU #2 is entering idle state and so calls bc_set_next(). In the worst case, the next_event will contain an expiry time, but the hrtimer will not be started which happens when the racing callback returns HRTIMER_NORESTART. The hrtimer might never recover if all further requests from the CPUs to subscribe to tick broadcast have timeout greater than the next_event of tick broadcast clock device. This leads to cascading of failures and finally noticed as rcu stall warnings Here is a depiction of the race condition CPU #1 (Running timer callback) CPU #2 (Enter idle and subscribe to tick broadcast) --------------------- --------------------- __run_hrtimer() tick_broadcast_enter() bc_handler() __tick_broadcast_oneshot_control() tick_handle_oneshot_broadcast() raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; //wait for tick_broadcast_lock //next_event for tick broadcast clock set to KTIME_MAX since no other cores subscribed to tick broadcasting raw_spin_unlock(&tick_broadcast_lock); if (dev->next_event == KTIME_MAX) return HRTIMER_NORESTART // callback function exits without restarting the hrtimer //tick_broadcast_lock acquired raw_spin_lock(&tick_broadcast_lock); tick_broadcast_set_event() clockevents_program_event() dev->next_event = expires; bc_set_next() hrtimer_try_to_cancel() //returns -1 since the timer callback is active. Exits without restarting the timer cpu_base->running = NULL; The comment that hrtimer cannot be armed from within the callback is wrong. It is fine to start the hrtimer from within the callback. Also it is safe to start the hrtimer from the enter/exit idle code while the broadcast handler is active. The enter/exit idle code and the broadcast handler are synchronized using tick_broadcast_lock. So there is no need for the existing try to cancel logic. All this can be removed which will eliminate the race condition as well. Fixes: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast") Originally-by: Thomas Gleixner Signed-off-by: Balasubramani Vivekanandan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190926135101.12102-2-balasubramani_vivekanandan@mentor.com --- kernel/time/tick-broadcast-hrtimer.c | 62 +++++++++++++--------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index c1f5bb590b5e..b5a65e212df2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt) */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { - int bc_moved; /* - * We try to cancel the timer first. If the callback is on - * flight on some other cpu then we let it handle it. If we - * were able to cancel the timer nothing can rearm it as we - * own broadcast_lock. + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. * - * However we can also be called from the event handler of - * ce_broadcast_hrtimer itself when it expires. We cannot - * restart the timer because we are in the callback, but we - * can set the expiry time and let the callback return - * HRTIMER_RESTART. + * hrtimer_cancel() cannot be called here neither from the + * broadcast handler nor from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. * - * Since we are in the idle loop at this point and because - * hrtimer_{start/cancel} functions call into tracing, - * calls to these functions must be bound within RCU_NONIDLE. + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callbback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. */ - RCU_NONIDLE( - { - bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) { - hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED_HARD); - } - } - ); - - if (bc_moved) { - /* Bind the "device" to the cpu */ - bc->bound_on = smp_processor_id(); - } else if (bc->bound_on == smp_processor_id()) { - hrtimer_set_expires(&bctimer, expires); - } + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. + */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); return 0; } @@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event != KTIME_MAX) - return HRTIMER_RESTART; - return HRTIMER_NORESTART; } From bab32fc069ce8829c416e8737c119f62a57970f9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2019 20:02:38 +0800 Subject: [PATCH 100/312] btrfs: qgroup: Fix the wrong target io_tree when freeing reserved data space [BUG] Under the following case with qgroup enabled, if some error happened after we have reserved delalloc space, then in error handling path, we could cause qgroup data space leakage: From btrfs_truncate_block() in inode.c: ret = btrfs_delalloc_reserve_space(inode, &data_reserved, block_start, blocksize); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); ret = -ENOMEM; goto out; } [CAUSE] In the above case, btrfs_delalloc_reserve_space() will call btrfs_qgroup_reserve_data() and mark the io_tree range with EXTENT_QGROUP_RESERVED flag. In the error handling path, we have the following call stack: btrfs_delalloc_release_space() |- btrfs_free_reserved_data_space() |- btrsf_qgroup_free_data() |- __btrfs_qgroup_release_data(reserved=@reserved, free=1) |- qgroup_free_reserved_data(reserved=@reserved) |- clear_record_extent_bits(); |- freed += changeset.bytes_changed; However due to a completion bug, qgroup_free_reserved_data() will clear EXTENT_QGROUP_RESERVED flag in BTRFS_I(inode)->io_failure_tree, other than the correct BTRFS_I(inode)->io_tree. Since io_failure_tree is never marked with that flag, btrfs_qgroup_free_data() will not free any data reserved space at all, causing a leakage. This type of error handling can only be triggered by errors outside of qgroup code. So EDQUOT error from qgroup can't trigger it. [FIX] Fix the wrong target io_tree. Reported-by: Josef Bacik Fixes: bc42bda22345 ("btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 52701c1be109..4ab85555a947 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3486,7 +3486,7 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) From d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 16 Sep 2019 20:02:39 +0800 Subject: [PATCH 101/312] btrfs: qgroup: Fix reserved data space leak if we have multiple reserve calls [BUG] The following script can cause btrfs qgroup data space leak: mkfs.btrfs -f $dev mount $dev -o nospace_cache $mnt btrfs subv create $mnt/subv btrfs quota en $mnt btrfs quota rescan -w $mnt btrfs qgroup limit 128m $mnt/subv for (( i = 0; i < 3; i++)); do # Create 3 64M holes for latter fallocate to fail truncate -s 192m $mnt/subv/file xfs_io -c "pwrite 64m 4k" $mnt/subv/file > /dev/null xfs_io -c "pwrite 128m 4k" $mnt/subv/file > /dev/null sync # it's supposed to fail, and each failure will leak at least 64M # data space xfs_io -f -c "falloc 0 192m" $mnt/subv/file &> /dev/null rm $mnt/subv/file sync done # Shouldn't fail after we removed the file xfs_io -f -c "falloc 0 64m" $mnt/subv/file [CAUSE] Btrfs qgroup data reserve code allow multiple reservations to happen on a single extent_changeset: E.g: btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_1M); btrfs_qgroup_reserve_data(inode, &data_reserved, SZ_1M, SZ_2M); btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_4M); Btrfs qgroup code has its internal tracking to make sure we don't double-reserve in above example. The only pattern utilizing this feature is in the main while loop of btrfs_fallocate() function. However btrfs_qgroup_reserve_data()'s error handling has a bug in that on error it clears all ranges in the io_tree with EXTENT_QGROUP_RESERVED flag but doesn't free previously reserved bytes. This bug has a two fold effect: - Clearing EXTENT_QGROUP_RESERVED ranges This is the correct behavior, but it prevents btrfs_qgroup_check_reserved_leak() to catch the leakage as the detector is purely EXTENT_QGROUP_RESERVED flag based. - Leak the previously reserved data bytes. The bug manifests when N calls to btrfs_qgroup_reserve_data are made and the last one fails, leaking space reserved in the previous ones. [FIX] Also free previously reserved data bytes when btrfs_qgroup_reserve_data fails. Fixes: 524725537023 ("btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4ab85555a947..c4bb69941c77 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3442,6 +3442,9 @@ cleanup: while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); + /* Also free data bytes of already reserved one */ + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, + orig_reserved, BTRFS_QGROUP_RSV_DATA); extent_changeset_release(reserved); return ret; } From 19a36d329f5b1e5d3d06f7093de992693d0fdee6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 26 Aug 2019 15:30:23 -0400 Subject: [PATCH 102/312] KVM: VMX: Set VMENTER_L1D_FLUSH_NOT_REQUIRED if !X86_BUG_L1TF The l1tf_vmx_mitigation is only set to VMENTER_L1D_FLUSH_NOT_REQUIRED when the ARCH_CAPABILITIES MSR indicates that L1D flush is not required. However, if the CPU is not affected by L1TF, l1tf_vmx_mitigation will still be set to VMENTER_L1D_FLUSH_AUTO. This is certainly not the best option for a !X86_BUG_L1TF CPU. So force l1tf_vmx_mitigation to VMENTER_L1D_FLUSH_NOT_REQUIRED to make it more explicit in case users are checking the vmentry_l1d_flush parameter. Signed-off-by: Waiman Long [Patch rewritten accoring to Borislav Petkov's suggestion. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4575ffb3cec..e7970a2e8eae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -7995,12 +8000,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE From 2e4a75976dfb88ee6ce019b1d4fa507aabd02c14 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 27 Sep 2019 17:54:13 +0200 Subject: [PATCH 103/312] KVM: selftests: x86: clarify what is reported on KVM_GET_MSRS failure When KVM_GET_MSRS fail the report looks like ==== Test Assertion Failure ==== lib/x86_64/processor.c:1089: r == nmsrs pid=28775 tid=28775 - Argument list too long 1 0x000000000040a55f: vcpu_save_state at processor.c:1088 (discriminator 3) 2 0x00000000004010e3: main at state_test.c:171 (discriminator 4) 3 0x00007fb8e69223d4: ?? ??:0 4 0x0000000000401287: _start at ??:? Unexpected result from KVM_GET_MSRS, r: 36 (failed at 194) and it's not obvious that '194' here is the failed MSR index and that it's printed in hex. Change that. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index c53dbc6bc568..6698cb741e10 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1085,7 +1085,7 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) for (i = 0; i < nmsrs; i++) state->msrs.entries[i].index = list->indices[i]; r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)", + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", r, r == nmsrs ? -1 : list->indices[r]); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); From 67b483dd03c4cd9e90e4c3943132dce514ea4e88 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 24 Sep 2019 11:27:05 -0700 Subject: [PATCH 104/312] nvme-rdma: fix possible use-after-free in connect timeout If the connect times out, we may have already destroyed the queue in the timeout handler, so test if the queue is still allocated in the connect error handler. Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9d16dfc29368..4d280160dd3f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -620,7 +620,8 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) if (!ret) { set_bit(NVME_RDMA_Q_LIVE, &queue->flags); } else { - __nvme_rdma_stop_queue(queue); + if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) + __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); } From a12de1d42d74ef3c80e9fb9a2da94daaef747869 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Sep 2019 15:24:30 +0800 Subject: [PATCH 105/312] blk-mq: honor IO scheduler for multiqueue devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a device is using multiple queues, the IO scheduler may be bypassed. This may hurt performance for some slow MQ devices, and it also breaks zoned devices which depend on mq-deadline for respecting the write order in one zone. Don't bypass io scheduler if we have one setup. This patch can double sequential write performance basically on MQ scsi_debug when mq-deadline is applied. Cc: Bart Van Assche Cc: Hannes Reinecke Cc: Dave Chinner Reviewed-by: Javier González Reviewed-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6e3b15f70cd7..5d4bef3c378d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2012,6 +2012,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } blk_add_rq_to_plug(plug, rq); + } else if (q->elevator) { + blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { /* * We do limited plugging. If the bio can be merged, do that. @@ -2035,8 +2037,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && - !data.hctx->dispatch_busy)) { + } else if ((q->nr_hw_queues > 1 && is_sync) || + !data.hctx->dispatch_busy) { blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { blk_mq_sched_insert_request(rq, false, true, true); From 3154df262db52f1d27e01020e871f4345ec2f9a8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Sep 2019 15:24:31 +0800 Subject: [PATCH 106/312] blk-mq: apply normal plugging for HDD Some HDD drive may expose multiple hardware queues, such as MegraRaid. Let's apply the normal plugging for such devices because sequential IO may benefit a lot from plug merging. Cc: Bart Van Assche Cc: Hannes Reinecke Cc: Dave Chinner Reviewed-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5d4bef3c378d..ec791156e9cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1992,10 +1992,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs || + !blk_queue_nonrot(q))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. + * + * Use normal plugging if this disk is slow HDD, as sequential + * IO may benefit a lot from plug merging. */ unsigned int request_count = plug->rq_count; struct request *last = NULL; From d1c536e3177390da43d99f20143b810c35433d1f Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Sep 2019 11:26:53 +0100 Subject: [PATCH 107/312] mmc: sdhci: improve ADMA error reporting ADMA errors are potentially data corrupting events; although we print the register state, we do not usefully print the ADMA descriptors. Worse than that, we print them by referencing their virtual address which is meaningless when the register state gives us the DMA address of the failing descriptor. Print the ADMA descriptors giving their DMA addresses rather than their virtual addresses, and print them using SDHCI_DUMP() rather than DBG(). We also do not show the correct value of the interrupt status register; the register dump shows the current value, after we have cleared the pending interrupts we are going to service. What is more useful is to print the interrupts that _were_ pending at the time the ADMA error was encountered. Fix that too. Signed-off-by: Russell King Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 4b297f397326..922a5b594c5e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2874,6 +2874,7 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) static void sdhci_adma_show_error(struct sdhci_host *host) { void *desc = host->adma_table; + dma_addr_t dma = host->adma_addr; sdhci_dumpregs(host); @@ -2881,18 +2882,21 @@ static void sdhci_adma_show_error(struct sdhci_host *host) struct sdhci_adma2_64_desc *dma_desc = desc; if (host->flags & SDHCI_USE_64_BIT_DMA) - DBG("%p: DMA 0x%08x%08x, LEN 0x%04x, Attr=0x%02x\n", - desc, le32_to_cpu(dma_desc->addr_hi), + SDHCI_DUMP("%08llx: DMA 0x%08x%08x, LEN 0x%04x, Attr=0x%02x\n", + (unsigned long long)dma, + le32_to_cpu(dma_desc->addr_hi), le32_to_cpu(dma_desc->addr_lo), le16_to_cpu(dma_desc->len), le16_to_cpu(dma_desc->cmd)); else - DBG("%p: DMA 0x%08x, LEN 0x%04x, Attr=0x%02x\n", - desc, le32_to_cpu(dma_desc->addr_lo), + SDHCI_DUMP("%08llx: DMA 0x%08x, LEN 0x%04x, Attr=0x%02x\n", + (unsigned long long)dma, + le32_to_cpu(dma_desc->addr_lo), le16_to_cpu(dma_desc->len), le16_to_cpu(dma_desc->cmd)); desc += host->desc_sz; + dma += host->desc_sz; if (dma_desc->cmd & cpu_to_le16(ADMA2_END)) break; @@ -2968,7 +2972,8 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) != MMC_BUS_TEST_R) host->data->error = -EILSEQ; else if (intmask & SDHCI_INT_ADMA_ERROR) { - pr_err("%s: ADMA error\n", mmc_hostname(host->mmc)); + pr_err("%s: ADMA error: 0x%08x\n", mmc_hostname(host->mmc), + intmask); sdhci_adma_show_error(host); host->data->error = -EIO; if (host->ops->adma_workaround) From 121bd08b029e03404c451bb237729cdff76eafed Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Sep 2019 11:26:58 +0100 Subject: [PATCH 108/312] mmc: sdhci-of-esdhc: set DMA snooping based on DMA coherence We must not unconditionally set the DMA snoop bit; if the DMA API is assuming that the device is not DMA coherent, and the device snoops the CPU caches, the device can see stale cache lines brought in by speculative prefetch. This leads to the device seeing stale data, potentially resulting in corrupted data transfers. Commonly, this results in a descriptor fetch error such as: mmc0: ADMA error mmc0: sdhci: ============ SDHCI REGISTER DUMP =========== mmc0: sdhci: Sys addr: 0x00000000 | Version: 0x00002202 mmc0: sdhci: Blk size: 0x00000008 | Blk cnt: 0x00000001 mmc0: sdhci: Argument: 0x00000000 | Trn mode: 0x00000013 mmc0: sdhci: Present: 0x01f50008 | Host ctl: 0x00000038 mmc0: sdhci: Power: 0x00000003 | Blk gap: 0x00000000 mmc0: sdhci: Wake-up: 0x00000000 | Clock: 0x000040d8 mmc0: sdhci: Timeout: 0x00000003 | Int stat: 0x00000001 mmc0: sdhci: Int enab: 0x037f108f | Sig enab: 0x037f108b mmc0: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00002202 mmc0: sdhci: Caps: 0x35fa0000 | Caps_1: 0x0000af00 mmc0: sdhci: Cmd: 0x0000333a | Max curr: 0x00000000 mmc0: sdhci: Resp[0]: 0x00000920 | Resp[1]: 0x001d8a33 mmc0: sdhci: Resp[2]: 0x325b5900 | Resp[3]: 0x3f400e00 mmc0: sdhci: Host ctl2: 0x00000000 mmc0: sdhci: ADMA Err: 0x00000009 | ADMA Ptr: 0x000000236d43820c mmc0: sdhci: ============================================ mmc0: error -5 whilst initialising SD card but can lead to other errors, and potentially direct the SDHCI controller to read/write data to other memory locations (e.g. if a valid descriptor is visible to the device in a stale cache line.) Fix this by ensuring that the DMA snoop bit corresponds with the behaviour of the DMA API. Since the driver currently only supports DT, use of_dma_is_coherent(). Note that device_get_dma_attr() can not be used as that risks re-introducing this bug if/when the driver is converted to ACPI. Signed-off-by: Russell King Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 3271c2d76629..1d1953dfc54b 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -495,7 +495,12 @@ static int esdhc_of_enable_dma(struct sdhci_host *host) dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); value = sdhci_readl(host, ESDHC_DMA_SYSCTL); - value |= ESDHC_DMA_SNOOP; + + if (of_dma_is_coherent(dev->of_node)) + value |= ESDHC_DMA_SNOOP; + else + value &= ~ESDHC_DMA_SNOOP; + sdhci_writel(host, value, ESDHC_DMA_SYSCTL); return 0; } From 4ee7dde4c777f14cb0f98dd201491bf6cc15899b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 23 Sep 2019 12:08:09 +0200 Subject: [PATCH 109/312] mmc: sdhci: Let drivers define their DMA mask Add host operation ->set_dma_mask() so that drivers can define their own DMA masks. Signed-off-by: Adrian Hunter Tested-by: Nicolin Chen Signed-off-by: Thierry Reding Cc: stable@vger.kernel.org # v4.15 + Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 12 ++++-------- drivers/mmc/host/sdhci.h | 1 + 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 922a5b594c5e..b056400e34b1 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -3781,18 +3781,14 @@ int sdhci_setup_host(struct sdhci_host *host) host->flags &= ~SDHCI_USE_ADMA; } - /* - * It is assumed that a 64-bit capable device has set a 64-bit DMA mask - * and *must* do 64-bit DMA. A driver has the opportunity to change - * that during the first call to ->enable_dma(). Similarly - * SDHCI_QUIRK2_BROKEN_64_BIT_DMA must be left to the drivers to - * implement. - */ if (sdhci_can_64bit_dma(host)) host->flags |= SDHCI_USE_64_BIT_DMA; if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { - ret = sdhci_set_dma_mask(host); + if (host->ops->set_dma_mask) + ret = host->ops->set_dma_mask(host); + else + ret = sdhci_set_dma_mask(host); if (!ret && host->ops->enable_dma) ret = host->ops->enable_dma(host); diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index a29c4cd2d92e..0ed3e0eaef5f 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -622,6 +622,7 @@ struct sdhci_ops { u32 (*irq)(struct sdhci_host *host, u32 intmask); + int (*set_dma_mask)(struct sdhci_host *host); int (*enable_dma)(struct sdhci_host *host); unsigned int (*get_max_clock)(struct sdhci_host *host); unsigned int (*get_min_clock)(struct sdhci_host *host); From b960bc448a252428bacca271f3416a8bda3b599b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 23 Sep 2019 12:08:10 +0200 Subject: [PATCH 110/312] mmc: tegra: Implement ->set_dma_mask() The SDHCI controller on Tegra186 supports 40-bit addressing, which is usually enough to address all of system memory. However, if the SDHCI controller is behind an IOMMU, the address space can go beyond. This happens on Tegra186 and later where the ARM SMMU has an input address space of 48 bits. If the DMA API is backed by this ARM SMMU, the top- down IOVA allocator will cause IOV addresses to be returned that the SDHCI controller cannot access. Unfortunately, prior to the introduction of the ->set_dma_mask() host operation, the SDHCI core would set either a 64-bit DMA mask if the controller claimed to support 64-bit addressing, or a 32-bit DMA mask otherwise. Since the full 64 bits cannot be addressed on Tegra, this had to be worked around in commit 68481a7e1c84 ("mmc: tegra: Mark 64 bit dma broken on Tegra186") by setting the SDHCI_QUIRK2_BROKEN_64_BIT_DMA quirk, which effectively restricts the DMA mask to 32 bits. One disadvantage of this is that dma_map_*() APIs will now try to use the swiotlb to bounce DMA to addresses beyond of the controller's DMA mask. This in turn caused degraded performance and can lead to situations where the swiotlb buffer is exhausted, which in turn leads to DMA transfers to fail. With the recent introduction of the ->set_dma_mask() host operation, this can now be properly fixed. For each generation of Tegra, the exact supported DMA mask can be configured. This kills two birds with one stone: it avoids the use of bounce buffers because system memory never exceeds the addressable memory range of the SDHCI controllers on these devices, and at the same time when an IOMMU is involved, it prevents IOV addresses from being allocated beyond the addressible range of the controllers. Since the DMA mask is now properly handled, the 64-bit DMA quirk can be removed. Signed-off-by: Nicolin Chen [treding@nvidia.com: provide more background in commit message] Tested-by: Nicolin Chen Acked-by: Adrian Hunter Signed-off-by: Thierry Reding Cc: stable@vger.kernel.org # v4.15 + Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 48 ++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 02d8f524bb9e..7bc950520fd9 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -104,6 +105,7 @@ struct sdhci_tegra_soc_data { const struct sdhci_pltfm_data *pdata; + u64 dma_mask; u32 nvquirks; u8 min_tap_delay; u8 max_tap_delay; @@ -1233,11 +1235,25 @@ static const struct cqhci_host_ops sdhci_tegra_cqhci_ops = { .update_dcmd_desc = sdhci_tegra_update_dcmd_desc, }; +static int tegra_sdhci_set_dma_mask(struct sdhci_host *host) +{ + struct sdhci_pltfm_host *platform = sdhci_priv(host); + struct sdhci_tegra *tegra = sdhci_pltfm_priv(platform); + const struct sdhci_tegra_soc_data *soc = tegra->soc_data; + struct device *dev = mmc_dev(host->mmc); + + if (soc->dma_mask) + return dma_set_mask_and_coherent(dev, soc->dma_mask); + + return 0; +} + static const struct sdhci_ops tegra_sdhci_ops = { .get_ro = tegra_sdhci_get_ro, .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .platform_execute_tuning = tegra_sdhci_execute_tuning, @@ -1257,6 +1273,7 @@ static const struct sdhci_pltfm_data sdhci_tegra20_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra20 = { .pdata = &sdhci_tegra20_pdata, + .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 | NVQUIRK_ENABLE_BLOCK_GAP_DET, }; @@ -1283,6 +1300,7 @@ static const struct sdhci_pltfm_data sdhci_tegra30_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra30 = { .pdata = &sdhci_tegra30_pdata, + .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | NVQUIRK_ENABLE_SDR50 | NVQUIRK_ENABLE_SDR104 | @@ -1295,6 +1313,7 @@ static const struct sdhci_ops tegra114_sdhci_ops = { .write_w = tegra_sdhci_writew, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .platform_execute_tuning = tegra_sdhci_execute_tuning, @@ -1316,6 +1335,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, + .dma_mask = DMA_BIT_MASK(32), }; static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { @@ -1325,22 +1345,13 @@ static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | - /* - * The TRM states that the SD/MMC controller found on - * Tegra124 can address 34 bits (the maximum supported by - * the Tegra memory controller), but tests show that DMA - * to or from above 4 GiB doesn't work. This is possibly - * caused by missing programming, though it's not obvious - * what sequence is required. Mark 64-bit DMA broken for - * now to fix this for existing users (e.g. Nyan boards). - */ - SDHCI_QUIRK2_BROKEN_64_BIT_DMA, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .ops = &tegra114_sdhci_ops, }; static const struct sdhci_tegra_soc_data soc_data_tegra124 = { .pdata = &sdhci_tegra124_pdata, + .dma_mask = DMA_BIT_MASK(34), }; static const struct sdhci_ops tegra210_sdhci_ops = { @@ -1349,6 +1360,7 @@ static const struct sdhci_ops tegra210_sdhci_ops = { .write_w = tegra210_sdhci_writew, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .set_uhs_signaling = tegra_sdhci_set_uhs_signaling, @@ -1369,6 +1381,7 @@ static const struct sdhci_pltfm_data sdhci_tegra210_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra210 = { .pdata = &sdhci_tegra210_pdata, + .dma_mask = DMA_BIT_MASK(34), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | @@ -1383,6 +1396,7 @@ static const struct sdhci_ops tegra186_sdhci_ops = { .read_w = tegra_sdhci_readw, .write_l = tegra_sdhci_writel, .set_clock = tegra_sdhci_set_clock, + .set_dma_mask = tegra_sdhci_set_dma_mask, .set_bus_width = sdhci_set_bus_width, .reset = tegra_sdhci_reset, .set_uhs_signaling = tegra_sdhci_set_uhs_signaling, @@ -1398,20 +1412,13 @@ static const struct sdhci_pltfm_data sdhci_tegra186_pdata = { SDHCI_QUIRK_NO_HISPD_BIT | SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC | SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, - .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | - /* SDHCI controllers on Tegra186 support 40-bit addressing. - * IOVA addresses are 48-bit wide on Tegra186. - * With 64-bit dma mask used for SDHCI, accesses can - * be broken. Disable 64-bit dma, which would fall back - * to 32-bit dma mask. Ideally 40-bit dma mask would work, - * But it is not supported as of now. - */ - SDHCI_QUIRK2_BROKEN_64_BIT_DMA, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .ops = &tegra186_sdhci_ops, }; static const struct sdhci_tegra_soc_data soc_data_tegra186 = { .pdata = &sdhci_tegra186_pdata, + .dma_mask = DMA_BIT_MASK(40), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | @@ -1424,6 +1431,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra186 = { static const struct sdhci_tegra_soc_data soc_data_tegra194 = { .pdata = &sdhci_tegra186_pdata, + .dma_mask = DMA_BIT_MASK(39), .nvquirks = NVQUIRK_NEEDS_PAD_CONTROL | NVQUIRK_HAS_PADCALIB | NVQUIRK_DIS_CARD_CLK_CONFIG_TAP | From e51df6ce668a8f75ce27f83ce0f60103c568c375 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Wed, 11 Sep 2019 15:23:44 +0800 Subject: [PATCH 111/312] mmc: host: sdhci-pci: Add Genesys Logic GL975x support Add support for the GL9750 and GL9755 chipsets. Enable v4 mode and wait 5ms after set 1.8V signal enable for GL9750/ GL9755. Fix the value of SDHCI_MAX_CURRENT register and use the vendor tuning flow for GL9750. Co-developed-by: Michael K Johnson Signed-off-by: Michael K Johnson Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 1 + drivers/mmc/host/Makefile | 2 +- drivers/mmc/host/sdhci-pci-core.c | 2 + drivers/mmc/host/sdhci-pci-gli.c | 352 ++++++++++++++++++++++++++++++ drivers/mmc/host/sdhci-pci.h | 5 + 5 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 drivers/mmc/host/sdhci-pci-gli.c diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 3a52f5703286..49ea02c467bf 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -94,6 +94,7 @@ config MMC_SDHCI_PCI depends on MMC_SDHCI && PCI select MMC_CQHCI select IOSF_MBI if X86 + select MMC_SDHCI_IO_ACCESSORS help This selects the PCI Secure Digital Host Controller Interface. Most controllers found today are PCI devices. diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 390ee162fe71..11c4598e91d9 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_MMC_MXS) += mxs-mmc.o obj-$(CONFIG_MMC_SDHCI) += sdhci.o obj-$(CONFIG_MMC_SDHCI_PCI) += sdhci-pci.o sdhci-pci-y += sdhci-pci-core.o sdhci-pci-o2micro.o sdhci-pci-arasan.o \ - sdhci-pci-dwc-mshc.o + sdhci-pci-dwc-mshc.o sdhci-pci-gli.o obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += sdhci-pci-data.o obj-$(CONFIG_MMC_SDHCI_ACPI) += sdhci-acpi.o obj-$(CONFIG_MMC_SDHCI_PXAV3) += sdhci-pxav3.o diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index e1ca185d7328..eaffa85bc728 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -1685,6 +1685,8 @@ static const struct pci_device_id pci_ids[] = { SDHCI_PCI_DEVICE(O2, SEABIRD1, o2), SDHCI_PCI_DEVICE(ARASAN, PHY_EMMC, arasan), SDHCI_PCI_DEVICE(SYNOPSYS, DWC_MSHC, snps), + SDHCI_PCI_DEVICE(GLI, 9750, gl9750), + SDHCI_PCI_DEVICE(GLI, 9755, gl9755), SDHCI_PCI_DEVICE_CLASS(AMD, SYSTEM_SDHCI, PCI_CLASS_MASK, amd), /* Generic SD host controller */ {PCI_DEVICE_CLASS(SYSTEM_SDHCI, PCI_CLASS_MASK)}, diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c new file mode 100644 index 000000000000..5eea8d70a85d --- /dev/null +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Genesys Logic, Inc. + * + * Authors: Ben Chuang + * + * Version: v0.9.0 (2019-08-08) + */ + +#include +#include +#include +#include +#include +#include "sdhci.h" +#include "sdhci-pci.h" + +/* Genesys Logic extra registers */ +#define SDHCI_GLI_9750_WT 0x800 +#define SDHCI_GLI_9750_WT_EN BIT(0) +#define GLI_9750_WT_EN_ON 0x1 +#define GLI_9750_WT_EN_OFF 0x0 + +#define SDHCI_GLI_9750_DRIVING 0x860 +#define SDHCI_GLI_9750_DRIVING_1 GENMASK(11, 0) +#define SDHCI_GLI_9750_DRIVING_2 GENMASK(27, 26) +#define GLI_9750_DRIVING_1_VALUE 0xFFF +#define GLI_9750_DRIVING_2_VALUE 0x3 + +#define SDHCI_GLI_9750_PLL 0x864 +#define SDHCI_GLI_9750_PLL_TX2_INV BIT(23) +#define SDHCI_GLI_9750_PLL_TX2_DLY GENMASK(22, 20) +#define GLI_9750_PLL_TX2_INV_VALUE 0x1 +#define GLI_9750_PLL_TX2_DLY_VALUE 0x0 + +#define SDHCI_GLI_9750_SW_CTRL 0x874 +#define SDHCI_GLI_9750_SW_CTRL_4 GENMASK(7, 6) +#define GLI_9750_SW_CTRL_4_VALUE 0x3 + +#define SDHCI_GLI_9750_MISC 0x878 +#define SDHCI_GLI_9750_MISC_TX1_INV BIT(2) +#define SDHCI_GLI_9750_MISC_RX_INV BIT(3) +#define SDHCI_GLI_9750_MISC_TX1_DLY GENMASK(6, 4) +#define GLI_9750_MISC_TX1_INV_VALUE 0x0 +#define GLI_9750_MISC_RX_INV_ON 0x1 +#define GLI_9750_MISC_RX_INV_OFF 0x0 +#define GLI_9750_MISC_RX_INV_VALUE GLI_9750_MISC_RX_INV_OFF +#define GLI_9750_MISC_TX1_DLY_VALUE 0x5 + +#define SDHCI_GLI_9750_TUNING_CONTROL 0x540 +#define SDHCI_GLI_9750_TUNING_CONTROL_EN BIT(4) +#define GLI_9750_TUNING_CONTROL_EN_ON 0x1 +#define GLI_9750_TUNING_CONTROL_EN_OFF 0x0 +#define SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1 BIT(16) +#define SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2 GENMASK(20, 19) +#define GLI_9750_TUNING_CONTROL_GLITCH_1_VALUE 0x1 +#define GLI_9750_TUNING_CONTROL_GLITCH_2_VALUE 0x2 + +#define SDHCI_GLI_9750_TUNING_PARAMETERS 0x544 +#define SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY GENMASK(2, 0) +#define GLI_9750_TUNING_PARAMETERS_RX_DLY_VALUE 0x1 + +#define GLI_MAX_TUNING_LOOP 40 + +/* Genesys Logic chipset */ +static inline void gl9750_wt_on(struct sdhci_host *host) +{ + u32 wt_value; + u32 wt_enable; + + wt_value = sdhci_readl(host, SDHCI_GLI_9750_WT); + wt_enable = FIELD_GET(SDHCI_GLI_9750_WT_EN, wt_value); + + if (wt_enable == GLI_9750_WT_EN_ON) + return; + + wt_value &= ~SDHCI_GLI_9750_WT_EN; + wt_value |= FIELD_PREP(SDHCI_GLI_9750_WT_EN, GLI_9750_WT_EN_ON); + + sdhci_writel(host, wt_value, SDHCI_GLI_9750_WT); +} + +static inline void gl9750_wt_off(struct sdhci_host *host) +{ + u32 wt_value; + u32 wt_enable; + + wt_value = sdhci_readl(host, SDHCI_GLI_9750_WT); + wt_enable = FIELD_GET(SDHCI_GLI_9750_WT_EN, wt_value); + + if (wt_enable == GLI_9750_WT_EN_OFF) + return; + + wt_value &= ~SDHCI_GLI_9750_WT_EN; + wt_value |= FIELD_PREP(SDHCI_GLI_9750_WT_EN, GLI_9750_WT_EN_OFF); + + sdhci_writel(host, wt_value, SDHCI_GLI_9750_WT); +} + +static void gli_set_9750(struct sdhci_host *host) +{ + u32 driving_value; + u32 pll_value; + u32 sw_ctrl_value; + u32 misc_value; + u32 parameter_value; + u32 control_value; + u16 ctrl2; + + gl9750_wt_on(host); + + driving_value = sdhci_readl(host, SDHCI_GLI_9750_DRIVING); + pll_value = sdhci_readl(host, SDHCI_GLI_9750_PLL); + sw_ctrl_value = sdhci_readl(host, SDHCI_GLI_9750_SW_CTRL); + misc_value = sdhci_readl(host, SDHCI_GLI_9750_MISC); + parameter_value = sdhci_readl(host, SDHCI_GLI_9750_TUNING_PARAMETERS); + control_value = sdhci_readl(host, SDHCI_GLI_9750_TUNING_CONTROL); + + driving_value &= ~(SDHCI_GLI_9750_DRIVING_1); + driving_value &= ~(SDHCI_GLI_9750_DRIVING_2); + driving_value |= FIELD_PREP(SDHCI_GLI_9750_DRIVING_1, + GLI_9750_DRIVING_1_VALUE); + driving_value |= FIELD_PREP(SDHCI_GLI_9750_DRIVING_2, + GLI_9750_DRIVING_2_VALUE); + sdhci_writel(host, driving_value, SDHCI_GLI_9750_DRIVING); + + sw_ctrl_value &= ~SDHCI_GLI_9750_SW_CTRL_4; + sw_ctrl_value |= FIELD_PREP(SDHCI_GLI_9750_SW_CTRL_4, + GLI_9750_SW_CTRL_4_VALUE); + sdhci_writel(host, sw_ctrl_value, SDHCI_GLI_9750_SW_CTRL); + + /* reset the tuning flow after reinit and before starting tuning */ + pll_value &= ~SDHCI_GLI_9750_PLL_TX2_INV; + pll_value &= ~SDHCI_GLI_9750_PLL_TX2_DLY; + pll_value |= FIELD_PREP(SDHCI_GLI_9750_PLL_TX2_INV, + GLI_9750_PLL_TX2_INV_VALUE); + pll_value |= FIELD_PREP(SDHCI_GLI_9750_PLL_TX2_DLY, + GLI_9750_PLL_TX2_DLY_VALUE); + + misc_value &= ~SDHCI_GLI_9750_MISC_TX1_INV; + misc_value &= ~SDHCI_GLI_9750_MISC_RX_INV; + misc_value &= ~SDHCI_GLI_9750_MISC_TX1_DLY; + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_TX1_INV, + GLI_9750_MISC_TX1_INV_VALUE); + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_VALUE); + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_TX1_DLY, + GLI_9750_MISC_TX1_DLY_VALUE); + + parameter_value &= ~SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY; + parameter_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_PARAMETERS_RX_DLY, + GLI_9750_TUNING_PARAMETERS_RX_DLY_VALUE); + + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1; + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_1, + GLI_9750_TUNING_CONTROL_GLITCH_1_VALUE); + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_GLITCH_2, + GLI_9750_TUNING_CONTROL_GLITCH_2_VALUE); + + sdhci_writel(host, pll_value, SDHCI_GLI_9750_PLL); + sdhci_writel(host, misc_value, SDHCI_GLI_9750_MISC); + + /* disable tuned clk */ + ctrl2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl2 &= ~SDHCI_CTRL_TUNED_CLK; + sdhci_writew(host, ctrl2, SDHCI_HOST_CONTROL2); + + /* enable tuning parameters control */ + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_EN; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_EN, + GLI_9750_TUNING_CONTROL_EN_ON); + sdhci_writel(host, control_value, SDHCI_GLI_9750_TUNING_CONTROL); + + /* write tuning parameters */ + sdhci_writel(host, parameter_value, SDHCI_GLI_9750_TUNING_PARAMETERS); + + /* disable tuning parameters control */ + control_value &= ~SDHCI_GLI_9750_TUNING_CONTROL_EN; + control_value |= FIELD_PREP(SDHCI_GLI_9750_TUNING_CONTROL_EN, + GLI_9750_TUNING_CONTROL_EN_OFF); + sdhci_writel(host, control_value, SDHCI_GLI_9750_TUNING_CONTROL); + + /* clear tuned clk */ + ctrl2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); + ctrl2 &= ~SDHCI_CTRL_TUNED_CLK; + sdhci_writew(host, ctrl2, SDHCI_HOST_CONTROL2); + + gl9750_wt_off(host); +} + +static void gli_set_9750_rx_inv(struct sdhci_host *host, bool b) +{ + u32 misc_value; + + gl9750_wt_on(host); + + misc_value = sdhci_readl(host, SDHCI_GLI_9750_MISC); + misc_value &= ~SDHCI_GLI_9750_MISC_RX_INV; + if (b) { + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_ON); + } else { + misc_value |= FIELD_PREP(SDHCI_GLI_9750_MISC_RX_INV, + GLI_9750_MISC_RX_INV_OFF); + } + sdhci_writel(host, misc_value, SDHCI_GLI_9750_MISC); + + gl9750_wt_off(host); +} + +static int __sdhci_execute_tuning_9750(struct sdhci_host *host, u32 opcode) +{ + int i; + int rx_inv; + + for (rx_inv = 0; rx_inv < 2; rx_inv++) { + gli_set_9750_rx_inv(host, !!rx_inv); + sdhci_start_tuning(host); + + for (i = 0; i < GLI_MAX_TUNING_LOOP; i++) { + u16 ctrl; + + sdhci_send_tuning(host, opcode); + + if (!host->tuning_done) { + sdhci_abort_tuning(host, opcode); + break; + } + + ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); + if (!(ctrl & SDHCI_CTRL_EXEC_TUNING)) { + if (ctrl & SDHCI_CTRL_TUNED_CLK) + return 0; /* Success! */ + break; + } + } + } + if (!host->tuning_done) { + pr_info("%s: Tuning timeout, falling back to fixed sampling clock\n", + mmc_hostname(host->mmc)); + return -ETIMEDOUT; + } + + pr_info("%s: Tuning failed, falling back to fixed sampling clock\n", + mmc_hostname(host->mmc)); + sdhci_reset_tuning(host); + + return -EAGAIN; +} + +static int gl9750_execute_tuning(struct sdhci_host *host, u32 opcode) +{ + host->mmc->retune_period = 0; + if (host->tuning_mode == SDHCI_TUNING_MODE_1) + host->mmc->retune_period = host->tuning_count; + + gli_set_9750(host); + host->tuning_err = __sdhci_execute_tuning_9750(host, opcode); + sdhci_end_tuning(host); + + return 0; +} + +static int gli_probe_slot_gl9750(struct sdhci_pci_slot *slot) +{ + struct sdhci_host *host = slot->host; + + slot->host->mmc->caps2 |= MMC_CAP2_NO_SDIO; + sdhci_enable_v4_mode(host); + + return 0; +} + +static int gli_probe_slot_gl9755(struct sdhci_pci_slot *slot) +{ + struct sdhci_host *host = slot->host; + + slot->host->mmc->caps2 |= MMC_CAP2_NO_SDIO; + sdhci_enable_v4_mode(host); + + return 0; +} + +static void sdhci_gli_voltage_switch(struct sdhci_host *host) +{ + /* + * According to Section 3.6.1 signal voltage switch procedure in + * SD Host Controller Simplified Spec. 4.20, steps 6~8 are as + * follows: + * (6) Set 1.8V Signal Enable in the Host Control 2 register. + * (7) Wait 5ms. 1.8V voltage regulator shall be stable within this + * period. + * (8) If 1.8V Signal Enable is cleared by Host Controller, go to + * step (12). + * + * Wait 5ms after set 1.8V signal enable in Host Control 2 register + * to ensure 1.8V signal enable bit is set by GL9750/GL9755. + */ + usleep_range(5000, 5500); +} + +static void sdhci_gl9750_reset(struct sdhci_host *host, u8 mask) +{ + sdhci_reset(host, mask); + gli_set_9750(host); +} + +static u32 sdhci_gl9750_readl(struct sdhci_host *host, int reg) +{ + u32 value; + + value = readl(host->ioaddr + reg); + if (unlikely(reg == SDHCI_MAX_CURRENT && !(value & 0xff))) + value |= 0xc8; + + return value; +} + +static const struct sdhci_ops sdhci_gl9755_ops = { + .set_clock = sdhci_set_clock, + .enable_dma = sdhci_pci_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .voltage_switch = sdhci_gli_voltage_switch, +}; + +const struct sdhci_pci_fixes sdhci_gl9755 = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .quirks2 = SDHCI_QUIRK2_BROKEN_DDR50, + .probe_slot = gli_probe_slot_gl9755, + .ops = &sdhci_gl9755_ops, +}; + +static const struct sdhci_ops sdhci_gl9750_ops = { + .read_l = sdhci_gl9750_readl, + .set_clock = sdhci_set_clock, + .enable_dma = sdhci_pci_enable_dma, + .set_bus_width = sdhci_set_bus_width, + .reset = sdhci_gl9750_reset, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .voltage_switch = sdhci_gli_voltage_switch, + .platform_execute_tuning = gl9750_execute_tuning, +}; + +const struct sdhci_pci_fixes sdhci_gl9750 = { + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC, + .quirks2 = SDHCI_QUIRK2_BROKEN_DDR50, + .probe_slot = gli_probe_slot_gl9750, + .ops = &sdhci_gl9750_ops, +}; diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h index 1abc9d47a4c0..558202fe64c6 100644 --- a/drivers/mmc/host/sdhci-pci.h +++ b/drivers/mmc/host/sdhci-pci.h @@ -68,6 +68,9 @@ #define PCI_DEVICE_ID_SYNOPSYS_DWC_MSHC 0xc202 +#define PCI_DEVICE_ID_GLI_9755 0x9755 +#define PCI_DEVICE_ID_GLI_9750 0x9750 + /* * PCI device class and mask */ @@ -188,5 +191,7 @@ int sdhci_pci_enable_dma(struct sdhci_host *host); extern const struct sdhci_pci_fixes sdhci_arasan; extern const struct sdhci_pci_fixes sdhci_snps; extern const struct sdhci_pci_fixes sdhci_o2; +extern const struct sdhci_pci_fixes sdhci_gl9750; +extern const struct sdhci_pci_fixes sdhci_gl9755; #endif /* __SDHCI_PCI_H */ From 6402939ec86eaf226c8b8ae00ed983936b164908 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Tue, 17 Sep 2019 17:47:12 -0500 Subject: [PATCH 112/312] ieee802154: ca8210: prevent memory leak In ca8210_probe the allocated pdata needs to be assigned to spi_device->dev.platform_data before calling ca8210_get_platform_data. Othrwise when ca8210_get_platform_data fails pdata cannot be released. Signed-off-by: Navid Emamdoost Link: https://lore.kernel.org/r/20190917224713.26371-1-navid.emamdoost@gmail.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index b188fce3f641..658b399ac9ea 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3152,12 +3152,12 @@ static int ca8210_probe(struct spi_device *spi_device) goto error; } + priv->spi->dev.platform_data = pdata; ret = ca8210_get_platform_data(priv->spi, pdata); if (ret) { dev_crit(&spi_device->dev, "ca8210_get_platform_data failed\n"); goto error; } - priv->spi->dev.platform_data = pdata; ret = ca8210_dev_com_init(priv); if (ret) { From f15d9a992f901d4f22db868adf800844d1cac9f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:55 +0200 Subject: [PATCH 113/312] iommu/amd: Remove domain->updated This struct member was used to track whether a domain change requires updates to the device-table and IOMMU cache flushes. The problem is, that access to this field is racy since locking in the common mapping code-paths has been eliminated. Move the updated field to the stack to get rid of all potential races and remove the field from the struct. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 49 +++++++++++++++++---------------- drivers/iommu/amd_iommu_types.h | 1 - 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7bdce3b10f3d..042854bbc5bc 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1458,10 +1458,11 @@ static void free_pagetable(struct protection_domain *domain) * another level increases the size of the address space by 9 bits to a size up * to 64 bits. */ -static void increase_address_space(struct protection_domain *domain, +static bool increase_address_space(struct protection_domain *domain, gfp_t gfp) { unsigned long flags; + bool ret = false; u64 *pte; spin_lock_irqsave(&domain->lock, flags); @@ -1478,19 +1479,21 @@ static void increase_address_space(struct protection_domain *domain, iommu_virt_to_phys(domain->pt_root)); domain->pt_root = pte; domain->mode += 1; - domain->updated = true; + + ret = true; out: spin_unlock_irqrestore(&domain->lock, flags); - return; + return ret; } static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, unsigned long page_size, u64 **pte_page, - gfp_t gfp) + gfp_t gfp, + bool *updated) { int level, end_lvl; u64 *pte, *page; @@ -1498,7 +1501,7 @@ static u64 *alloc_pte(struct protection_domain *domain, BUG_ON(!is_power_of_2(page_size)); while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); + *updated = increase_address_space(domain, gfp) || *updated; level = domain->mode - 1; pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; @@ -1530,7 +1533,7 @@ static u64 *alloc_pte(struct protection_domain *domain, for (i = 0; i < count; ++i) cmpxchg64(&lpte[i], __pte, 0ULL); - domain->updated = true; + *updated = true; continue; } @@ -1547,7 +1550,7 @@ static u64 *alloc_pte(struct protection_domain *domain, if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); else if (IOMMU_PTE_PRESENT(__pte)) - domain->updated = true; + *updated = true; continue; } @@ -1656,28 +1659,29 @@ static int iommu_map_page(struct protection_domain *dom, gfp_t gfp) { struct page *freelist = NULL; + bool updated = false; u64 __pte, *pte; - int i, count; + int ret, i, count; BUG_ON(!IS_ALIGNED(bus_addr, page_size)); BUG_ON(!IS_ALIGNED(phys_addr, page_size)); + ret = -EINVAL; if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; + goto out; count = PAGE_SIZE_PTE_COUNT(page_size); - pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); + pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated); - if (!pte) { - update_domain(dom); - return -ENOMEM; - } + ret = -ENOMEM; + if (!pte) + goto out; for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); if (freelist != NULL) - dom->updated = true; + updated = true; if (count > 1) { __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); @@ -1693,12 +1697,16 @@ static int iommu_map_page(struct protection_domain *dom, for (i = 0; i < count; ++i) pte[i] = __pte; - update_domain(dom); + ret = 0; + +out: + if (updated) + update_domain(dom); /* Everything flushed out, free pages now */ free_page_list(freelist); - return 0; + return ret; } static unsigned long iommu_unmap_page(struct protection_domain *dom, @@ -2399,15 +2407,10 @@ static void update_device_table(struct protection_domain *domain) static void update_domain(struct protection_domain *domain) { - if (!domain->updated) - return; - update_device_table(domain); domain_flush_devices(domain); domain_flush_tlb_pde(domain); - - domain->updated = false; } static int dir2prot(enum dma_data_direction direction) @@ -3333,7 +3336,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) /* Update data structure */ domain->mode = PAGE_MODE_NONE; - domain->updated = true; /* Make changes visible to IOMMUs */ update_domain(domain); @@ -3379,7 +3381,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) domain->glx = levels; domain->flags |= PD_IOMMUV2_MASK; - domain->updated = true; update_domain(domain); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 9ac229e92b07..0186501ab971 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -475,7 +475,6 @@ struct protection_domain { int glx; /* Number of levels for GCR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ - bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; From 3a11905b69eb026402448c750f97a0eadfa76b08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:56 +0200 Subject: [PATCH 114/312] iommu/amd: Remove amd_iommu_devtable_lock The lock is not necessary because the device table does not contain shared state that needs protection. Locking is only needed on an individual entry basis, and that needs to happen on the iommu_dev_data level. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 042854bbc5bc..37a9c04fc728 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -70,7 +70,6 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) -static DEFINE_SPINLOCK(amd_iommu_devtable_lock); static DEFINE_SPINLOCK(pd_bitmap_lock); /* List of all available dev_data structures */ @@ -2080,10 +2079,11 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { + unsigned long flags; int ret; /* lock domain */ - spin_lock(&domain->lock); + spin_lock_irqsave(&domain->lock, flags); ret = -EBUSY; if (dev_data->domain != NULL) @@ -2097,7 +2097,7 @@ static int __attach_device(struct iommu_dev_data *dev_data, out_unlock: /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&domain->lock, flags); return ret; } @@ -2181,7 +2181,6 @@ static int attach_device(struct device *dev, { struct pci_dev *pdev; struct iommu_dev_data *dev_data; - unsigned long flags; int ret; dev_data = get_dev_data(dev); @@ -2209,9 +2208,7 @@ static int attach_device(struct device *dev, } skip_ats_check: - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev_data, domain); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* * We might boot into a crash-kernel here. The crashed kernel @@ -2231,14 +2228,15 @@ skip_ats_check: static void __detach_device(struct iommu_dev_data *dev_data) { struct protection_domain *domain; + unsigned long flags; domain = dev_data->domain; - spin_lock(&domain->lock); + spin_lock_irqsave(&domain->lock, flags); do_detach(dev_data); - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&domain->lock, flags); } /* @@ -2248,7 +2246,6 @@ static void detach_device(struct device *dev) { struct protection_domain *domain; struct iommu_dev_data *dev_data; - unsigned long flags; dev_data = get_dev_data(dev); domain = dev_data->domain; @@ -2262,10 +2259,7 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) return; - /* lock device table */ - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); if (!dev_is_pci(dev)) return; @@ -2910,9 +2904,6 @@ int __init amd_iommu_init_dma_ops(void) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, @@ -2920,8 +2911,6 @@ static void cleanup_domain(struct protection_domain *domain) BUG_ON(!entry->domain); __detach_device(entry); } - - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } static void protection_domain_free(struct protection_domain *domain) From f6c0bfce271b2dd613e8b8e009eefe89c1f788e8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:57 +0200 Subject: [PATCH 115/312] iommu/amd: Take domain->lock for complete attach/detach path The code-paths before __attach_device() and __detach_device() are called also access and modify domain state, so take the domain lock there too. This allows to get rid of the __detach_device() function. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 65 ++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 37a9c04fc728..2919168577ff 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2079,27 +2079,13 @@ static void do_detach(struct iommu_dev_data *dev_data) static int __attach_device(struct iommu_dev_data *dev_data, struct protection_domain *domain) { - unsigned long flags; - int ret; - - /* lock domain */ - spin_lock_irqsave(&domain->lock, flags); - - ret = -EBUSY; if (dev_data->domain != NULL) - goto out_unlock; + return -EBUSY; /* Attach alias group root */ do_attach(dev_data, domain); - ret = 0; - -out_unlock: - - /* ready */ - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; + return 0; } @@ -2181,8 +2167,11 @@ static int attach_device(struct device *dev, { struct pci_dev *pdev; struct iommu_dev_data *dev_data; + unsigned long flags; int ret; + spin_lock_irqsave(&domain->lock, flags); + dev_data = get_dev_data(dev); if (!dev_is_pci(dev)) @@ -2190,12 +2179,13 @@ static int attach_device(struct device *dev, pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + ret = -EINVAL; if (!dev_data->passthrough) - return -EINVAL; + goto out; if (dev_data->iommu_v2) { if (pdev_iommuv2_enable(pdev) != 0) - return -EINVAL; + goto out; dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); @@ -2219,24 +2209,10 @@ skip_ats_check: domain_flush_complete(domain); - return ret; -} - -/* - * Removes a device from a protection domain (unlocked) - */ -static void __detach_device(struct iommu_dev_data *dev_data) -{ - struct protection_domain *domain; - unsigned long flags; - - domain = dev_data->domain; - - spin_lock_irqsave(&domain->lock, flags); - - do_detach(dev_data); - +out: spin_unlock_irqrestore(&domain->lock, flags); + + return ret; } /* @@ -2246,10 +2222,13 @@ static void detach_device(struct device *dev) { struct protection_domain *domain; struct iommu_dev_data *dev_data; + unsigned long flags; dev_data = get_dev_data(dev); domain = dev_data->domain; + spin_lock_irqsave(&domain->lock, flags); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2257,12 +2236,12 @@ static void detach_device(struct device *dev) * our alias handling. */ if (WARN_ON(!dev_data->domain)) - return; + goto out; - __detach_device(dev_data); + do_detach(dev_data); if (!dev_is_pci(dev)) - return; + goto out; if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) pdev_iommuv2_disable(to_pci_dev(dev)); @@ -2270,6 +2249,9 @@ static void detach_device(struct device *dev) pci_disable_ats(to_pci_dev(dev)); dev_data->ats.enabled = false; + +out: + spin_unlock_irqrestore(&domain->lock, flags); } static int amd_iommu_add_device(struct device *dev) @@ -2904,13 +2886,18 @@ int __init amd_iommu_init_dma_ops(void) static void cleanup_domain(struct protection_domain *domain) { struct iommu_dev_data *entry; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, struct iommu_dev_data, list); BUG_ON(!entry->domain); - __detach_device(entry); + do_detach(entry); } + + spin_unlock_irqrestore(&domain->lock, flags); } static void protection_domain_free(struct protection_domain *domain) From 45e528d9c479aeef2d3d1db1e619b243f91e324f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:58 +0200 Subject: [PATCH 116/312] iommu/amd: Check for busy devices earlier in attach_device() Check early in attach_device whether the device is already attached to a domain. This also simplifies the code path so that __attach_device() can be removed. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 2919168577ff..459247c32dc0 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2072,23 +2072,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -/* - * If a device is not yet associated with a domain, this function makes the - * device visible in the domain - */ -static int __attach_device(struct iommu_dev_data *dev_data, - struct protection_domain *domain) -{ - if (dev_data->domain != NULL) - return -EBUSY; - - /* Attach alias group root */ - do_attach(dev_data, domain); - - return 0; -} - - static void pdev_iommuv2_disable(struct pci_dev *pdev) { pci_disable_ats(pdev); @@ -2174,6 +2157,10 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); + ret = -EBUSY; + if (dev_data->domain != NULL) + goto out; + if (!dev_is_pci(dev)) goto skip_ats_check; @@ -2198,7 +2185,9 @@ static int attach_device(struct device *dev, } skip_ats_check: - ret = __attach_device(dev_data, domain); + ret = 0; + + do_attach(dev_data, domain); /* * We might boot into a crash-kernel here. The crashed kernel From ab7b2577f0d119052b98b8d913bad369ac2760eb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:22:59 +0200 Subject: [PATCH 117/312] iommu/amd: Lock dev_data in attach/detach code paths Make sure that attaching a detaching a device can't race against each other and protect the iommu_dev_data with a spin_lock in these code paths. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 9 +++++++++ drivers/iommu/amd_iommu_types.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 459247c32dc0..bac4e20a5919 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -201,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid) if (!dev_data) return NULL; + spin_lock_init(&dev_data->lock); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -2157,6 +2158,8 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); + spin_lock(&dev_data->lock); + ret = -EBUSY; if (dev_data->domain != NULL) goto out; @@ -2199,6 +2202,8 @@ skip_ats_check: domain_flush_complete(domain); out: + spin_unlock(&dev_data->lock); + spin_unlock_irqrestore(&domain->lock, flags); return ret; @@ -2218,6 +2223,8 @@ static void detach_device(struct device *dev) spin_lock_irqsave(&domain->lock, flags); + spin_lock(&dev_data->lock); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2240,6 +2247,8 @@ static void detach_device(struct device *dev) dev_data->ats.enabled = false; out: + spin_unlock(&dev_data->lock); + spin_unlock_irqrestore(&domain->lock, flags); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 0186501ab971..c9c1612d52e0 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -633,6 +633,9 @@ struct devid_map { * This struct contains device specific data for the IOMMU */ struct iommu_dev_data { + /*Protect against attach/detach races */ + spinlock_t lock; + struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ From 2a78f9962565e53b78363eaf516eb052009e8020 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Sep 2019 15:23:00 +0200 Subject: [PATCH 118/312] iommu/amd: Lock code paths traversing protection_domain->dev_list The traversing of this list requires protection_domain->lock to be taken to avoid nasty races with attach/detach code. Make sure the lock is held on all code-paths traversing this list. Reported-by: Filippo Sironi Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reviewed-by: Filippo Sironi Reviewed-by: Jerry Snitselaar Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index bac4e20a5919..9c26976a0f99 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1334,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain, dma_addr_t iova, size_t size) { if (unlikely(amd_iommu_np_cache)) { + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); domain_flush_complete(domain); + spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1700,8 +1704,13 @@ static int iommu_map_page(struct protection_domain *dom, ret = 0; out: - if (updated) + if (updated) { + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); update_domain(dom); + spin_unlock_irqrestore(&dom->lock, flags); + } /* Everything flushed out, free pages now */ free_page_list(freelist); @@ -1857,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain) static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) { + unsigned long flags; + + spin_lock_irqsave(&dom->domain.lock, flags); domain_flush_tlb(&dom->domain); domain_flush_complete(&dom->domain); + spin_unlock_irqrestore(&dom->domain.lock, flags); } static void iova_domain_flush_tlb(struct iova_domain *iovad) @@ -2414,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev, { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start, ret; + unsigned long flags; unsigned int pages; int prot = 0; int i; @@ -2451,8 +2465,10 @@ out_unmap: iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); } + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, address, pages); @@ -2481,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, } if (amd_iommu_unmap_flush) { + unsigned long flags; + + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, dma_addr, pages); } else { pages = __roundup_pow_of_two(pages); @@ -3246,9 +3266,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + spin_lock_irqsave(&dom->lock, flags); domain_flush_tlb_pde(dom); domain_flush_complete(dom); + spin_unlock_irqrestore(&dom->lock, flags); } static void amd_iommu_iotlb_sync(struct iommu_domain *domain, From 127068abe85bf3dee50df51cb039a5a987a4a666 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 5 Sep 2019 20:24:12 +0100 Subject: [PATCH 119/312] i2c: qcom-geni: Disable DMA processing on the Lenovo Yoga C630 We have a production-level laptop (Lenovo Yoga C630) which is exhibiting a rather horrific bug. When I2C HID devices are being scanned for at boot-time the QCom Geni based I2C (Serial Engine) attempts to use DMA. When it does, the laptop reboots and the user never sees the OS. Attempts are being made to debug the reason for the spontaneous reboot. No luck so far, hence the requirement for this hot-fix. This workaround will be removed once we have a viable fix. Signed-off-by: Lee Jones Tested-by: Bjorn Andersson Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-geni.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index a89bfce5388e..17abf60c94ae 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -355,11 +355,13 @@ static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, { dma_addr_t rx_dma; unsigned long time_left; - void *dma_buf; + void *dma_buf = NULL; struct geni_se *se = &gi2c->se; size_t len = msg->len; - dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (!of_machine_is_compatible("lenovo,yoga-c630")) + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) geni_se_select_mode(se, GENI_SE_DMA); else @@ -394,11 +396,13 @@ static int geni_i2c_tx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, { dma_addr_t tx_dma; unsigned long time_left; - void *dma_buf; + void *dma_buf = NULL; struct geni_se *se = &gi2c->se; size_t len = msg->len; - dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (!of_machine_is_compatible("lenovo,yoga-c630")) + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) geni_se_select_mode(se, GENI_SE_DMA); else From a71e2ac1f32097fbb2beab098687a7a95c84543e Mon Sep 17 00:00:00 2001 From: Chris Brandt Date: Thu, 26 Sep 2019 07:19:09 -0500 Subject: [PATCH 120/312] i2c: riic: Clear NACK in tend isr The NACKF flag should be cleared in INTRIICNAKI interrupt processing as description in HW manual. This issue shows up quickly when PREEMPT_RT is applied and a device is probed that is not plugged in (like a touchscreen controller). The result is endless interrupts that halt system boot. Fixes: 310c18a41450 ("i2c: riic: add driver") Cc: stable@vger.kernel.org Reported-by: Chien Nguyen Signed-off-by: Chris Brandt Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-riic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c index f31413fd9521..800414886f6b 100644 --- a/drivers/i2c/busses/i2c-riic.c +++ b/drivers/i2c/busses/i2c-riic.c @@ -202,6 +202,7 @@ static irqreturn_t riic_tend_isr(int irq, void *data) if (readb(riic->base + RIIC_ICSR2) & ICSR2_NACKF) { /* We got a NACKIE */ readb(riic->base + RIIC_ICDRR); /* dummy read */ + riic_clear_set_bit(riic, ICSR2_NACKF, 0, RIIC_ICSR2); riic->err = -ENXIO; } else if (riic->bytes_left) { return IRQ_NONE; From fd4b204a0971854c35795ab60b3673a5b57dfebc Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 27 Sep 2019 14:09:11 +0300 Subject: [PATCH 121/312] i2c: i801: Bring back Block Process Call support for certain platforms Commit b84398d6d7f9 ("i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond") looks like to drop by accident Block Write-Block Read Process Call support for Intel Sunrisepoint, Lewisburg, Denverton and Kaby Lake. That support was added for above and newer platforms by the commit 315cd67c9453 ("i2c: i801: Add Block Write-Block Read Process Call support") so bring it back for above platforms. Fixes: b84398d6d7f9 ("i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond") Signed-off-by: Jarkko Nikula Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index c09791fb4929..f1c714acc280 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1736,6 +1736,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: + priv->features |= FEATURE_BLOCK_PROC; priv->features |= FEATURE_I2C_BLOCK_READ; priv->features |= FEATURE_IRQ; priv->features |= FEATURE_SMBUS_PEC; From 11af27f494086188620e7768e421894af93c126a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Fri, 6 Sep 2019 16:06:09 +0200 Subject: [PATCH 122/312] i2c: slave-eeprom: Add read only mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add read-only versions of all EEPROMs. These versions are read-only on the i2c side, but can be written from the sysfs side. Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index 92ff9991bae8..db9763cb4dae 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -33,11 +33,13 @@ struct eeprom_data { u16 address_mask; u8 num_address_bytes; u8 idx_write_cnt; + bool read_only; u8 buffer[]; }; #define I2C_SLAVE_BYTELEN GENMASK(15, 0) #define I2C_SLAVE_FLAG_ADDR16 BIT(16) +#define I2C_SLAVE_FLAG_RO BIT(17) #define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len)) static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, @@ -53,9 +55,11 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8); eeprom->idx_write_cnt++; } else { - spin_lock(&eeprom->buffer_lock); - eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; - spin_unlock(&eeprom->buffer_lock); + if (!eeprom->read_only) { + spin_lock(&eeprom->buffer_lock); + eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; + spin_unlock(&eeprom->buffer_lock); + } } break; @@ -130,6 +134,7 @@ static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_de eeprom->idx_write_cnt = 0; eeprom->num_address_bytes = flag_addr16 ? 2 : 1; eeprom->address_mask = size - 1; + eeprom->read_only = FIELD_GET(I2C_SLAVE_FLAG_RO, id->driver_data); spin_lock_init(&eeprom->buffer_lock); i2c_set_clientdata(client, eeprom); @@ -165,8 +170,11 @@ static int i2c_slave_eeprom_remove(struct i2c_client *client) static const struct i2c_device_id i2c_slave_eeprom_id[] = { { "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, 0) }, + { "slave-24c02ro", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, I2C_SLAVE_FLAG_RO) }, { "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c32ro", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) }, { "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c64ro", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) }, { } }; MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id); From d2aea95a1a4d195d939d16303700921be318a2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Sep 2019 05:53:29 -0400 Subject: [PATCH 123/312] tracing/probe: Fix to check the difference of nr_args before adding probe Steven reported that a test triggered: ================================================================== BUG: KASAN: slab-out-of-bounds in trace_kprobe_create+0xa9e/0xe40 Read of size 8 at addr ffff8880c4f25a48 by task ftracetest/4798 CPU: 2 PID: 4798 Comm: ftracetest Not tainted 5.3.0-rc6-test+ #30 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x7c/0xc0 ? trace_kprobe_create+0xa9e/0xe40 print_address_description+0x6c/0x332 ? trace_kprobe_create+0xa9e/0xe40 ? trace_kprobe_create+0xa9e/0xe40 __kasan_report.cold.6+0x1a/0x3b ? trace_kprobe_create+0xa9e/0xe40 kasan_report+0xe/0x12 trace_kprobe_create+0xa9e/0xe40 ? print_kprobe_event+0x280/0x280 ? match_held_lock+0x1b/0x240 ? find_held_lock+0xac/0xd0 ? fs_reclaim_release.part.112+0x5/0x20 ? lock_downgrade+0x350/0x350 ? kasan_unpoison_shadow+0x30/0x40 ? __kasan_kmalloc.constprop.6+0xc1/0xd0 ? trace_kprobe_create+0xe40/0xe40 ? trace_kprobe_create+0xe40/0xe40 create_or_delete_trace_kprobe+0x2e/0x60 trace_run_command+0xc3/0xe0 ? trace_panic_handler+0x20/0x20 ? kasan_unpoison_shadow+0x30/0x40 trace_parse_run_command+0xdc/0x163 vfs_write+0xe1/0x240 ksys_write+0xba/0x150 ? __ia32_sys_read+0x50/0x50 ? tracer_hardirqs_on+0x61/0x180 ? trace_hardirqs_off_caller+0x43/0x110 ? mark_held_locks+0x29/0xa0 ? do_syscall_64+0x14/0x260 do_syscall_64+0x68/0x260 Fix to check the difference of nr_args before adding probe on existing probes. This also may set the error log index bigger than the number of command parameters. In that case it sets the error position is next to the last parameter. Link: http://lkml.kernel.org/r/156966474783.3478.13217501608215769150.stgit@devnote2 Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Reported-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index baf58a3612c0..905b10af5d5c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type) if (!command) return; + if (trace_probe_log.index >= trace_probe_log.argc) { + /** + * Set the error position is next to the last arg + space. + * Note that len includes the terminal null and the cursor + * appaers at pos + 1. + */ + pos = len; + offset = 0; + } + /* And make a command string from argv array */ p = command; for (i = 0; i < trace_probe_log.argc; i++) { @@ -1084,6 +1094,12 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) { int i; + /* In case of more arguments */ + if (a->nr_args < b->nr_args) + return a->nr_args + 1; + if (a->nr_args > b->nr_args) + return b->nr_args + 1; + for (i = 0; i < a->nr_args; i++) { if ((b->nr_args <= i) || ((a->args[i].type != b->args[i].type) || From 968e5170939662341242812b9c82ef51cf140a33 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 26 Sep 2019 09:22:59 -0700 Subject: [PATCH 124/312] tracing: Fix clang -Wint-in-bool-context warnings in IF_ASSIGN macro After r372664 in clang, the IF_ASSIGN macro causes a couple hundred warnings along the lines of: kernel/trace/trace_output.c:1331:2: warning: converting the enum constant to a boolean [-Wint-in-bool-context] kernel/trace/trace.h:409:3: note: expanded from macro 'trace_assign_type' IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, ^ kernel/trace/trace.h:371:14: note: expanded from macro 'IF_ASSIGN' WARN_ON(id && (entry)->type != id); \ ^ 264 warnings generated. This warning can catch issues with constructs like: if (state == A || B) where the developer really meant: if (state == A || state == B) This is currently the only occurrence of the warning in the kernel tree across defconfig, allyesconfig, allmodconfig for arm32, arm64, and x86_64. Add the implicit '!= 0' to the WARN_ON statement to fix the warnings and find potential issues in the future. Link: https://github.com/llvm/llvm-project/commit/28b38c277a2941e9e891b2db30652cfd962f070b Link: https://github.com/ClangBuiltLinux/linux/issues/686 Link: http://lkml.kernel.org/r/20190926162258.466321-1-natechancellor@gmail.com Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 26b0a08f3c7d..f801d154ff6a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -365,11 +365,11 @@ static inline struct trace_array *top_trace_array(void) __builtin_types_compatible_p(typeof(var), type *) #undef IF_ASSIGN -#define IF_ASSIGN(var, entry, etype, id) \ - if (FTRACE_CMP_TYPE(var, etype)) { \ - var = (typeof(var))(entry); \ - WARN_ON(id && (entry)->type != id); \ - break; \ +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id != 0 && (entry)->type != id); \ + break; \ } /* Will cause compile errors if type is not found. */ From 96c5c6e6a5b6db592acae039fed54b5c8844cd35 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Fri, 20 Sep 2019 17:57:59 -0500 Subject: [PATCH 125/312] tracing: Have error path in predicate_parse() free its allocated memory In predicate_parse, there is an error path that is not going to out_free instead it returns directly which leads to a memory leak. Link: http://lkml.kernel.org/r/20190920225800.3870-1-navid.emamdoost@gmail.com Signed-off-by: Navid Emamdoost Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c773b8fb270c..c9a74f82b14a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, switch (*next) { case '(': /* #2 */ - if (top - op_stack > nr_parens) - return ERR_PTR(-EINVAL); + if (top - op_stack > nr_parens) { + ret = -EINVAL; + goto out_free; + } *(++top) = invert; continue; case '!': /* #3 */ From f7d6316fb43735ae8af969c2e582fbee85709483 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Sat, 14 Sep 2019 18:32:15 +0800 Subject: [PATCH 126/312] mm, tracing: Print symbol name for call_site in trace events To improve the readability of raw slab trace points, print the call_site ip using '%pS'. Then we can grep events with function names. [002] .... 808.188897: kmem_cache_free: call_site=putname+0x47/0x50 ptr=00000000cef40c80 [002] .... 808.188898: kfree: call_site=security_cred_free+0x42/0x50 ptr=0000000062400820 [002] .... 808.188904: kmem_cache_free: call_site=put_cred_rcu+0x88/0xa0 ptr=0000000058d74ef8 [002] .... 808.188913: kmem_cache_alloc: call_site=prepare_creds+0x26/0x100 ptr=0000000058d74ef8 bytes_req=168 bytes_alloc=576 gfp_flags=GFP_KERNEL [002] .... 808.188917: kmalloc: call_site=security_prepare_creds+0x77/0xa0 ptr=0000000062400820 bytes_req=8 bytes_alloc=336 gfp_flags=GFP_KERNEL|__GFP_ZERO [002] .... 808.188920: kmem_cache_alloc: call_site=getname_flags+0x4f/0x1e0 ptr=00000000cef40c80 bytes_req=4096 bytes_alloc=4480 gfp_flags=GFP_KERNEL [002] .... 808.188925: kmem_cache_free: call_site=putname+0x47/0x50 ptr=00000000cef40c80 [002] .... 808.188926: kfree: call_site=security_cred_free+0x42/0x50 ptr=0000000062400820 [002] .... 808.188931: kmem_cache_free: call_site=put_cred_rcu+0x88/0xa0 ptr=0000000058d74ef8 Link: http://lkml.kernel.org/r/20190914103215.23301-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/kmem.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index eb57e3037deb..69e8bb8963db 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -35,8 +35,8 @@ DECLARE_EVENT_CLASS(kmem_alloc, __entry->gfp_flags = gfp_flags; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", - __entry->call_site, + TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", + (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, @@ -131,7 +131,8 @@ DECLARE_EVENT_CLASS(kmem_free, __entry->ptr = ptr; ), - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) + TP_printk("call_site=%pS ptr=%p", + (void *)__entry->call_site, __entry->ptr) ); DEFINE_EVENT(kmem_free, kfree, From 8ed4889eb83179dbc9a105cfed65cc42ecb61097 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 27 Sep 2019 11:10:22 -0400 Subject: [PATCH 127/312] selftests/ftrace: Fix same probe error test The "same probe" selftest that tests that adding the same probe fails doesn't add the same probe and passes, which fails the test. Fixes: b78b94b82122 ("selftests/ftrace: Update kprobe event error testcase") Signed-off-by: Steven Rostedt (VMware) --- .../selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index 8a4025e912cb..ef1e9bafb098 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -95,7 +95,7 @@ echo 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events check_error 'p:kprobes/testevent _do_fork ^bcd=\1' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE -check_error '^p:kprobes/testevent _do_fork' # SAME_PROBE +check_error '^p:kprobes/testevent _do_fork abcd=\1' # SAME_PROBE fi exit 0 From dc925a36060e8cef050a9d05c64dae1c30dc5027 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Sep 2019 10:29:49 +0200 Subject: [PATCH 128/312] Documentation/process: Clarify disclosure rules The role of the contact list provided by the disclosing party and how it affects the disclosure process and the ability to include experts into the development process is not really well explained. Neither is it entirely clear when the disclosing party will be informed about the fact that a developer who is not covered by an employer NDA needs to be brought in and disclosed. Explain the role of the contact list and the information policy along with an eventual conflict resolution better. Reported-by: Dave Hansen Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Link: https://lore.kernel.org/r/alpine.DEB.2.21.1909251028390.10825@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- .../process/embargoed-hardware-issues.rst | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index e57b9f39c69f..a3c3349046c4 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -143,6 +143,20 @@ via their employer, they cannot enter individual non-disclosure agreements in their role as Linux kernel developers. They will, however, agree to adhere to this documented process and the Memorandum of Understanding. +The disclosing party should provide a list of contacts for all other +entities who have already been, or should be, informed about the issue. +This serves several purposes: + + - The list of disclosed entities allows communication accross the + industry, e.g. other OS vendors, HW vendors, etc. + + - The disclosed entities can be contacted to name experts who should + participate in the mitigation development. + + - If an expert which is required to handle an issue is employed by an + listed entity or member of an listed entity, then the response teams can + request the disclosure of that expert from that entity. This ensures + that the expert is also part of the entity's response team. Disclosure """""""""" @@ -158,10 +172,7 @@ Mitigation development """""""""""""""""""""" The initial response team sets up an encrypted mailing-list or repurposes -an existing one if appropriate. The disclosing party should provide a list -of contacts for all other parties who have already been, or should be, -informed about the issue. The response team contacts these parties so they -can name experts who should be subscribed to the mailing-list. +an existing one if appropriate. Using a mailing-list is close to the normal Linux development process and has been successfully used in developing mitigations for various hardware @@ -175,9 +186,24 @@ development branch against the mainline kernel and backport branches for stable kernel versions as necessary. The initial response team will identify further experts from the Linux -kernel developer community as needed and inform the disclosing party about -their participation. Bringing in experts can happen at any time of the -development process and often needs to be handled in a timely manner. +kernel developer community as needed. Bringing in experts can happen at any +time of the development process and needs to be handled in a timely manner. + +If an expert is employed by or member of an entity on the disclosure list +provided by the disclosing party, then participation will be requested from +the relevant entity. + +If not, then the disclosing party will be informed about the experts +participation. The experts are covered by the Memorandum of Understanding +and the disclosing party is requested to acknowledge the participation. In +case that the disclosing party has a compelling reason to object, then this +objection has to be raised within five work days and resolved with the +incident team immediately. If the disclosing party does not react within +five work days this is taken as silent acknowledgement. + +After acknowledgement or resolution of an objection the expert is disclosed +by the incident team and brought into the development process. + Coordinated release """"""""""""""""""" From 50ee7529ec4500c88f8664560770a7a1b65db72b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 28 Sep 2019 16:53:52 -0700 Subject: [PATCH 129/312] random: try to actively add entropy rather than passively wait for it For 5.3 we had to revert a nice ext4 IO pattern improvement, because it caused a bootup regression due to lack of entropy at bootup together with arguably broken user space that was asking for secure random numbers when it really didn't need to. See commit 72dbcf721566 (Revert "ext4: make __ext4_get_inode_loc plug"). This aims to solve the issue by actively generating entropy noise using the CPU cycle counter when waiting for the random number generator to initialize. This only works when you have a high-frequency time stamp counter available, but that's the case on all modern x86 CPU's, and on most other modern CPU's too. What we do is to generate jitter entropy from the CPU cycle counter under a somewhat complex load: calling the scheduler while also guaranteeing a certain amount of timing noise by also triggering a timer. I'm sure we can tweak this, and that people will want to look at other alternatives, but there's been a number of papers written on jitter entropy, and this should really be fairly conservative by crediting one bit of entropy for every timer-induced jump in the cycle counter. Not because the timer itself would be all that unpredictable, but because the interaction between the timer and the loop is going to be. Even if (and perhaps particularly if) the timer actually happens on another CPU, the cacheline interaction between the loop that reads the cycle counter and the timer itself firing is going to add perturbations to the cycle counter values that get mixed into the entropy pool. As Thomas pointed out, with a modern out-of-order CPU, even quite simple loops show a fair amount of hard-to-predict timing variability even in the absense of external interrupts. But this tries to take that further by actually having a fairly complex interaction. This is not going to solve the entropy issue for architectures that have no CPU cycle counter, but it's not clear how (and if) that is solvable, and the hardware in question is largely starting to be irrelevant. And by doing this we can at least avoid some of the even more contentious approaches (like making the entropy waiting time out in order to avoid the possibly unbounded waiting). Cc: Ahmed Darwish Cc: Thomas Gleixner Cc: Theodore Ts'o Cc: Nicholas Mc Guire Cc: Andy Lutomirski Cc: Kees Cook Cc: Willy Tarreau Cc: Alexander E. Patrakov Cc: Lennart Poettering Signed-off-by: Linus Torvalds --- drivers/char/random.c | 62 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 5d5ea4ce1442..2fda6166c1dd 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1731,6 +1731,56 @@ void get_random_bytes(void *buf, int nbytes) } EXPORT_SYMBOL(get_random_bytes); + +/* + * Each time the timer fires, we expect that we got an unpredictable + * jump in the cycle counter. Even if the timer is running on another + * CPU, the timer activity will be touching the stack of the CPU that is + * generating entropy.. + * + * Note that we don't re-arm the timer in the timer itself - we are + * happy to be scheduled away, since that just makes the load more + * complex, but we do not want the timer to keep ticking unless the + * entropy loop is running. + * + * So the re-arming always happens in the entropy loop itself. + */ +static void entropy_timer(struct timer_list *t) +{ + credit_entropy_bits(&input_pool, 1); +} + +/* + * If we have an actual cycle counter, see if we can + * generate enough entropy with timing noise + */ +static void try_to_generate_entropy(void) +{ + struct { + unsigned long now; + struct timer_list timer; + } stack; + + stack.now = random_get_entropy(); + + /* Slow counter - or none. Don't even bother */ + if (stack.now == random_get_entropy()) + return; + + timer_setup_on_stack(&stack.timer, entropy_timer, 0); + while (!crng_ready()) { + if (!timer_pending(&stack.timer)) + mod_timer(&stack.timer, jiffies+1); + mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now)); + schedule(); + stack.now = random_get_entropy(); + } + + del_timer_sync(&stack.timer); + destroy_timer_on_stack(&stack.timer); + mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now)); +} + /* * Wait for the urandom pool to be seeded and thus guaranteed to supply * cryptographically secure random numbers. This applies to: the /dev/urandom @@ -1745,7 +1795,17 @@ int wait_for_random_bytes(void) { if (likely(crng_ready())) return 0; - return wait_event_interruptible(crng_init_wait, crng_ready()); + + do { + int ret; + ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ); + if (ret) + return ret > 0 ? 0 : ret; + + try_to_generate_entropy(); + } while (!crng_ready()); + + return 0; } EXPORT_SYMBOL(wait_for_random_bytes); From 02f03c4206c1b2a7451d3b3546f86c9c783eac13 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Sep 2019 17:59:23 -0700 Subject: [PATCH 130/312] Revert "Revert "ext4: make __ext4_get_inode_loc plug"" This reverts commit 72dbcf72156641fde4d8ea401e977341bfd35a05. Instead of waiting forever for entropy that may just not happen, we now try to actively generate entropy when required, and are thus hopefully avoiding the problem that caused the nice ext4 IO pattern fix to be reverted. So revert the revert. Cc: Ahmed S. Darwish Cc: Ted Ts'o Cc: Willy Tarreau Cc: Alexander E. Patrakov Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 006b7a2070bf..420fe3deed39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4586,6 +4586,7 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; + struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4674,6 +4675,7 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ + blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4704,6 +4706,7 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, From fdbdcddc2c93096e9b956de930d2d710a1342502 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 28 Aug 2019 16:35:19 +0300 Subject: [PATCH 131/312] csky: Use generic free_initrd_mem() The csky implementation of free_initrd_mem() is an open-coded version of free_reserved_area() without poisoning. Remove it and make csky use the generic version of free_initrd_mem(). Signed-off-by: Mike Rapoport Signed-off-by: Guo Ren --- arch/csky/mm/init.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index eb0dc9e5065f..d4c2292ea46b 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -60,22 +60,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (start < end) - pr_info("Freeing initrd memory: %ldk freed\n", - (end - start) >> 10); - - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages_inc(); - } -} -#endif - extern char __init_begin[], __init_end[]; void free_initmem(void) From 48ede51fd94fe9251058fc85626b2aeb5cbb5884 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 25 Sep 2019 19:56:16 +0800 Subject: [PATCH 132/312] csky: Fixup add zero_fp fixup perf backtrace panic We need set fp zero to let backtrace know the end. The patch fixup perf callchain panic problem, because backtrace didn't know what is the end of fp. Signed-off-by: Guo Ren Reported-by: Mao Han --- arch/csky/kernel/entry.S | 50 +++++++++++++++++++++++--------------- arch/csky/kernel/process.c | 2 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index a7e84ccccbd8..564dab2fabaa 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -17,6 +17,12 @@ #define PTE_INDX_SHIFT 10 #define _PGDIR_SHIFT 22 +.macro zero_fp +#ifdef CONFIG_STACKTRACE + movi r8, 0 +#endif +.endm + .macro tlbop_begin name, val0, val1, val2 ENTRY(csky_\name) mtcr a3, ss2 @@ -96,6 +102,7 @@ ENTRY(csky_\name) SAVE_ALL 0 .endm .macro tlbop_end is_write + zero_fp RD_MEH a2 psrset ee, ie mov a0, sp @@ -120,6 +127,7 @@ tlbop_end 1 ENTRY(csky_systemcall) SAVE_ALL TRAP0_SIZE + zero_fp psrset ee, ie @@ -136,9 +144,9 @@ ENTRY(csky_systemcall) mov r9, sp bmaski r10, THREAD_SHIFT andn r9, r10 - ldw r8, (r9, TINFO_FLAGS) - ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) - cmpnei r8, 0 + ldw r12, (r9, TINFO_FLAGS) + ANDI_R3 r12, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + cmpnei r12, 0 bt csky_syscall_trace #if defined(__CSKYABIV2__) subi sp, 8 @@ -180,7 +188,7 @@ csky_syscall_trace: ENTRY(ret_from_kernel_thread) jbsr schedule_tail - mov a0, r8 + mov a0, r10 jsr r9 jbsr ret_from_exception @@ -189,9 +197,9 @@ ENTRY(ret_from_fork) mov r9, sp bmaski r10, THREAD_SHIFT andn r9, r10 - ldw r8, (r9, TINFO_FLAGS) - ANDI_R3 r8, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) - cmpnei r8, 0 + ldw r12, (r9, TINFO_FLAGS) + ANDI_R3 r12, (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) + cmpnei r12, 0 bf ret_from_exception mov a0, sp /* sp = pt_regs pointer */ jbsr syscall_trace_exit @@ -209,9 +217,9 @@ ret_from_exception: bmaski r10, THREAD_SHIFT andn r9, r10 - ldw r8, (r9, TINFO_FLAGS) - andi r8, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED) - cmpnei r8, 0 + ldw r12, (r9, TINFO_FLAGS) + andi r12, (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED) + cmpnei r12, 0 bt exit_work 1: RESTORE_ALL @@ -220,11 +228,11 @@ exit_work: lrw syscallid, ret_from_exception mov lr, syscallid - btsti r8, TIF_NEED_RESCHED + btsti r12, TIF_NEED_RESCHED bt work_resched mov a0, sp - mov a1, r8 + mov a1, r12 jmpi do_notify_resume work_resched: @@ -232,6 +240,7 @@ work_resched: ENTRY(csky_trap) SAVE_ALL 0 + zero_fp psrset ee mov a0, sp /* Push Stack pointer arg */ jbsr trap_c /* Call C-level trap handler */ @@ -265,6 +274,7 @@ ENTRY(csky_get_tls) ENTRY(csky_irq) SAVE_ALL 0 + zero_fp psrset ee #ifdef CONFIG_PREEMPT @@ -276,21 +286,21 @@ ENTRY(csky_irq) * Get task_struct->stack.preempt_count for current, * and increase 1. */ - ldw r8, (r9, TINFO_PREEMPT) - addi r8, 1 - stw r8, (r9, TINFO_PREEMPT) + ldw r12, (r9, TINFO_PREEMPT) + addi r12, 1 + stw r12, (r9, TINFO_PREEMPT) #endif mov a0, sp jbsr csky_do_IRQ #ifdef CONFIG_PREEMPT - subi r8, 1 - stw r8, (r9, TINFO_PREEMPT) - cmpnei r8, 0 + subi r12, 1 + stw r12, (r9, TINFO_PREEMPT) + cmpnei r12, 0 bt 2f - ldw r8, (r9, TINFO_FLAGS) - btsti r8, TIF_NEED_RESCHED + ldw r12, (r9, TINFO_FLAGS) + btsti r12, TIF_NEED_RESCHED bf 2f 1: jbsr preempt_schedule_irq /* irq en/disable is done inside */ diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index e555740c0be5..f320d9248a22 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -55,7 +55,7 @@ int copy_thread(unsigned long clone_flags, if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; - childstack->r8 = kthread_arg; + childstack->r10 = kthread_arg; childstack->r9 = usp; childregs->sr = mfcr("psr"); } else { From 3a09d8e2893b2403a043890e5832966e8640feaf Mon Sep 17 00:00:00 2001 From: Mao Han Date: Wed, 25 Sep 2019 17:23:02 +0800 Subject: [PATCH 133/312] csky: Fixup csky_pmu.max_period assignment The csky_pmu.max_period has type u64, and BIT() can only return 32 bits unsigned long on C-SKY. The initialization for max_period will be incorrect when count_width is bigger than 32. Use BIT_ULL() Signed-off-by: Mao Han Signed-off-by: Guo Ren --- arch/csky/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c index 4c1a1934d76a..7570109cddc6 100644 --- a/arch/csky/kernel/perf_event.c +++ b/arch/csky/kernel/perf_event.c @@ -1306,7 +1306,7 @@ int csky_pmu_device_probe(struct platform_device *pdev, &csky_pmu.count_width)) { csky_pmu.count_width = DEFAULT_COUNT_WIDTH; } - csky_pmu.max_period = BIT(csky_pmu.count_width) - 1; + csky_pmu.max_period = BIT_ULL(csky_pmu.count_width) - 1; csky_pmu.plat_device = pdev; From a2139d3b4fd7ef26a363a1b1eb6cd55be2c1bcd1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Sep 2019 15:36:14 +0100 Subject: [PATCH 134/312] csky: entry: Remove unneeded need_resched() loop Since the enabling and disabling of IRQs within preempt_schedule_irq() is contained in a need_resched() loop, we don't need the outer arch code loop. Signed-off-by: Valentin Schneider Signed-off-by: Guo Ren --- arch/csky/kernel/entry.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index 564dab2fabaa..a7a5b67df898 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -302,11 +302,7 @@ ENTRY(csky_irq) ldw r12, (r9, TINFO_FLAGS) btsti r12, TIF_NEED_RESCHED bf 2f -1: jbsr preempt_schedule_irq /* irq en/disable is done inside */ - ldw r7, (r9, TINFO_FLAGS) /* get new tasks TI_FLAGS */ - btsti r7, TIF_NEED_RESCHED - bt 1b /* go again */ #endif 2: jmpi ret_from_exception From 9af032a30172e119a5935f802b066631f8ded2d6 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 3 Sep 2019 13:36:51 +0200 Subject: [PATCH 135/312] csky: Move static keyword to the front of declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declaration of csky_pmu_of_device_ids, and resolve the following compiler warning that can be seen when building with warnings enabled (W=1): arch/csky/kernel/perf_event.c:1340:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Krzysztof Wilczynski Signed-off-by: Guo Ren --- arch/csky/kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c index 7570109cddc6..1a29f1157449 100644 --- a/arch/csky/kernel/perf_event.c +++ b/arch/csky/kernel/perf_event.c @@ -1337,7 +1337,7 @@ int csky_pmu_device_probe(struct platform_device *pdev, return ret; } -const static struct of_device_id csky_pmu_of_device_ids[] = { +static const struct of_device_id csky_pmu_of_device_ids[] = { {.compatible = "csky,csky-pmu"}, {}, }; From d7d44b6fe40a98e960be92ea8617954c2596d140 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Sep 2019 22:33:57 +0200 Subject: [PATCH 136/312] drm/tilcdc: include linux/pinctrl/consumer.h again This was apparently dropped by accident in a recent cleanup, causing a build failure in some configurations now: drivers/gpu/drm/tilcdc/tilcdc_tfp410.c:296:12: error: implicit declaration of function 'devm_pinctrl_get_select_default' [-Werror,-Wimplicit-function-declaration] Fixes: fcb57664172e ("drm/tilcdc: drop use of drmP.h") Signed-off-by: Arnd Bergmann Reviewed-by: Laurent Pinchart Signed-off-by: Jyri Sarha Link: https://patchwork.freedesktop.org/patch/msgid/02b03f74cf941f52a941a36bdc8dabb4a69fd87e.1569852588.git.jsarha@ti.com Acked-by: Sam Ravnborg --- drivers/gpu/drm/tilcdc/tilcdc_tfp410.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c index 525dc1c0f1c1..530edb3b51cc 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include From 833b45de69a6016c4b0cebe6765d526a31a81580 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 30 Sep 2019 18:48:44 +0200 Subject: [PATCH 137/312] kvm: x86, powerpc: do not allow clearing largepages debugfs entry The largepages debugfs entry is incremented/decremented as shadow pages are created or destroyed. Clearing it will result in an underflow, which is harmless to KVM but ugly (and could be misinterpreted by tools that use debugfs information), so make this particular statistic read-only. Cc: kvm-ppc@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 ++++---- arch/x86/kvm/x86.c | 6 +++--- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 10 +++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index d7fcdfa7fee4..ec2547cc5ecb 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -36,8 +36,8 @@ #include "book3s.h" #include "trace.h" -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ /* #define EXIT_DEBUG */ @@ -69,8 +69,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, - { "largepages_2M", VM_STAT(num_2M_pages) }, - { "largepages_1G", VM_STAT(num_1G_pages) }, + { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) }, + { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) }, { NULL } }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 180c7e88577a..8072acaaf028 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fcb46b3374c6..719fc3e15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1090,6 +1090,7 @@ enum kvm_stat_kind { struct kvm_stat_data { int offset; + int mode; struct kvm *kvm; }; @@ -1097,6 +1098,7 @@ struct kvm_stats_debugfs_item { const char *name; int offset; enum kvm_stat_kind kind; + int mode; }; extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6de3159e682..fd68fbe0a75d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -617,8 +617,9 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) stat_data->kvm = kvm; stat_data->offset = p->offset; + stat_data->mode = p->mode ? p->mode : 0644; kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, + debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, stat_data, stat_fops_per_vm[p->kind]); } return 0; @@ -3929,7 +3930,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; - if (simple_attr_open(inode, file, get, set, fmt)) { + if (simple_attr_open(inode, file, get, + stat_data->mode & S_IWUGO ? set : NULL, + fmt)) { kvm_put_kvm(stat_data->kvm); return -ENOMEM; } @@ -4177,7 +4180,8 @@ static void kvm_init_debug(void) kvm_debugfs_num_entries = 0; for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - debugfs_create_file(p->name, 0644, kvm_debugfs_dir, + int mode = p->mode ? p->mode : 0644; + debugfs_create_file(p->name, mode, kvm_debugfs_dir, (void *)(long)p->offset, stat_fops[p->kind]); } From 54ecb8f7028c5eb3d740bb82b0f1d90f2df63c5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 30 Sep 2019 10:35:40 -0700 Subject: [PATCH 138/312] Linux 5.4-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d456746da347..6f54f2f95743 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 3 +PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Bobtail Squid # *DOCUMENTATION* From 7ae6d93c8f052b7a77ba56ed0f654e22a2876739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Thu, 26 Sep 2019 10:59:17 +0200 Subject: [PATCH 139/312] net: dsa: qca8k: Use up to 7 ports for all operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The QCA8K family supports up to 7 ports. So use the existing QCA8K_NUM_PORTS define to allocate the switch structure and limit all operations with the switch ports. This was not an issue until commit 0394a63acfe2 ("net: dsa: enable and disable all ports") disabled all unused ports. Since the unused ports 7-11 are outside of the correct register range on this switch some registers were rewritten with invalid content. Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family") Fixes: a0c02161ecfc ("net: dsa: variable number of ports") Fixes: 0394a63acfe2 ("net: dsa: enable and disable all ports") Signed-off-by: Michal Vokáč Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 684aa51684db..b00274caae4f 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -705,7 +705,7 @@ qca8k_setup(struct dsa_switch *ds) BIT(0) << QCA8K_GLOBAL_FW_CTRL1_UC_DP_S); /* Setup connection between CPU port & user ports */ - for (i = 0; i < DSA_MAX_PORTS; i++) { + for (i = 0; i < QCA8K_NUM_PORTS; i++) { /* CPU port gets connected to all user ports of the switch */ if (dsa_is_cpu_port(ds, i)) { qca8k_rmw(priv, QCA8K_PORT_LOOKUP_CTRL(QCA8K_CPU_PORT), @@ -1077,7 +1077,7 @@ qca8k_sw_probe(struct mdio_device *mdiodev) if (id != QCA8K_ID_QCA8337) return -ENODEV; - priv->ds = dsa_switch_alloc(&mdiodev->dev, DSA_MAX_PORTS); + priv->ds = dsa_switch_alloc(&mdiodev->dev, QCA8K_NUM_PORTS); if (!priv->ds) return -ENOMEM; From e9789c7cc182484fc031fd88097eb14cb26c4596 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2019 18:24:43 -0700 Subject: [PATCH 140/312] sch_cbq: validate TCA_CBQ_WRROPT to avoid crash syzbot reported a crash in cbq_normalize_quanta() caused by an out of range cl->priority. iproute2 enforces this check, but malicious users do not. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI Modules linked in: CPU: 1 PID: 26447 Comm: syz-executor.1 Not tainted 5.3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:cbq_normalize_quanta.part.0+0x1fd/0x430 net/sched/sch_cbq.c:902 RSP: 0018:ffff8801a5c333b0 EFLAGS: 00010206 RAX: 0000000020000003 RBX: 00000000fffffff8 RCX: ffffc9000712f000 RDX: 00000000000043bf RSI: ffffffff83be8962 RDI: 0000000100000018 RBP: ffff8801a5c33420 R08: 000000000000003a R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000002ef R13: ffff88018da95188 R14: dffffc0000000000 R15: 0000000000000015 FS: 00007f37d26b1700(0000) GS:ffff8801dad00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004c7cec CR3: 00000001bcd0a006 CR4: 00000000001626f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] cbq_normalize_quanta include/net/pkt_sched.h:27 [inline] [] cbq_addprio net/sched/sch_cbq.c:1097 [inline] [] cbq_set_wrr+0x2d7/0x450 net/sched/sch_cbq.c:1115 [] cbq_change_class+0x987/0x225b net/sched/sch_cbq.c:1537 [] tc_ctl_tclass+0x555/0xcd0 net/sched/sch_api.c:2329 [] rtnetlink_rcv_msg+0x485/0xc10 net/core/rtnetlink.c:5248 [] netlink_rcv_skb+0x17a/0x460 net/netlink/af_netlink.c:2510 [] rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5266 [] netlink_unicast_kernel net/netlink/af_netlink.c:1324 [inline] [] netlink_unicast+0x536/0x720 net/netlink/af_netlink.c:1350 [] netlink_sendmsg+0x89a/0xd50 net/netlink/af_netlink.c:1939 [] sock_sendmsg_nosec net/socket.c:673 [inline] [] sock_sendmsg+0x12e/0x170 net/socket.c:684 [] ___sys_sendmsg+0x81d/0x960 net/socket.c:2359 [] __sys_sendmsg+0x105/0x1d0 net/socket.c:2397 [] SYSC_sendmsg net/socket.c:2406 [inline] [] SyS_sendmsg+0x29/0x30 net/socket.c:2404 [] do_syscall_64+0x528/0x770 arch/x86/entry/common.c:305 [] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 06c7a2da21bc..39b427dc7512 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1127,6 +1127,33 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, }; +static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, + cbq_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CBQ_WRROPT]) { + const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); + + if (wrr->priority > TC_CBQ_MAXPRIO) { + NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); + err = -EINVAL; + } + } + return err; +} + static int cbq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1139,13 +1166,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; - if (!opt) { - NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; @@ -1464,13 +1485,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (!opt) { - NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; From 0e141f757b2c78c983df893e9993313e2dc21e38 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 27 Sep 2019 14:58:20 +0800 Subject: [PATCH 141/312] erspan: remove the incorrect mtu limit for erspan erspan driver calls ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. It causes the dev mtu of the erspan device to not be greater than 1500, this limit value is not correct for ipgre tap device. Tested: Before patch: # ip link set erspan0 mtu 1600 Error: mtu greater than device maximum. After patch: # ip link set erspan0 mtu 1600 # ip -d link show erspan0 21: erspan0@NONE: mtu 1600 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 0 Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a53a543fe055..52690bb3e40f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1446,6 +1446,7 @@ static void erspan_setup(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; From 61129dd29f7962f278b618a2a3e8fdb986a66dc8 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 17 Sep 2019 09:18:53 +0200 Subject: [PATCH 142/312] sched: Add __ASSEMBLY__ guards around struct clone_args The addition of struct clone_args to uapi/linux/sched.h is not protected by __ASSEMBLY__ guards, causing a failure to build from source for glibc on RISC-V. Add the guards to fix this. Fixes: 7f192e3cd316 ("fork: add clone3") Signed-off-by: Seth Forshee Cc: Acked-by: Ingo Molnar Link: https://lore.kernel.org/r/20190917071853.12385-1-seth.forshee@canonical.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..851ff1feadd5 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,7 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +#ifndef __ASSEMBLY__ /* * Arguments for the clone3 syscall */ @@ -46,6 +47,7 @@ struct clone_args { __aligned_u64 stack_size; __aligned_u64 tls; }; +#endif /* * Scheduling policies From 3969e76909d3aa06715997896184ee684f68d164 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 24 Sep 2019 13:52:37 -0600 Subject: [PATCH 143/312] selftests: pidfd: Fix undefined reference to pthread_create() Fix build failure: undefined reference to `pthread_create' collect2: error: ld returned 1 exit status Fix CFLAGS to include pthread correctly. Fixes: 740378dc7834 ("pidfd: add polling selftests") Signed-off-by: Shuah Khan Reviewed-by: Christian Brauner Cc: Link: https://lore.kernel.org/r/20190924195237.30519-1-skhan@linuxfoundation.org Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 464c9b76148f..7550f08822a3 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -g -I../../../../usr/include/ -lpthread +CFLAGS += -g -I../../../../usr/include/ -pthread TEST_GEN_PROGS := pidfd_test pidfd_open_test pidfd_poll_test pidfd_wait From 517d6b9c6f71bee15c729155445c42cc4ef77dda Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 18 Sep 2019 08:30:33 +0000 Subject: [PATCH 144/312] erofs: fix return value check in erofs_read_superblock() In case of error, the function read_mapping_page() returns ERR_PTR() not NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: fe7c2423570d ("erofs: use read_mapping_page instead of sb_bread") Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20190918083033.47780-1-weiyongjun1@huawei.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index caf9a95173b0..0e369494f2f2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -105,9 +105,9 @@ static int erofs_read_superblock(struct super_block *sb) int ret; page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); - if (!page) { + if (IS_ERR(page)) { erofs_err(sb, "cannot read erofs superblock"); - return -EIO; + return PTR_ERR(page); } sbi = EROFS_SB(sb); From 6927cc05c2b067d4ca8ab6aeaa3b4544bea26ef8 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 19 Sep 2019 14:28:38 +0800 Subject: [PATCH 145/312] MAINTAINERS: erofs: complete sub-entries for erofs Add a formal git tree and missing files for erofs after moving out of staging for everyone to blame in order for better improvement. Acked-by: Chao Yu Link: https://lore.kernel.org/r/20190919062838.106423-1-gaoxiang25@huawei.com Signed-off-by: Gao Xiang --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 296de2b51c83..30a5b4028d2f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6112,7 +6112,10 @@ M: Gao Xiang M: Chao Yu L: linux-erofs@lists.ozlabs.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git +F: Documentation/filesystems/erofs.txt F: fs/erofs/ +F: include/trace/events/erofs.h ERRSEQ ERROR TRACKING INFRASTRUCTURE M: Jeff Layton From 55252ab72b774119afedfdc6d1f142ffa2a9b818 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 22 Sep 2019 02:43:55 +0800 Subject: [PATCH 146/312] erofs: fix erofs_get_meta_page locking due to a cleanup After doing more drop_caches stress test on our products, I found the mistake introduced by a very recent cleanup [1]. The current rule is that "erofs_get_meta_page" should be returned with page locked (although it's mostly unnecessary for read-only fs after pages are PG_uptodate), but a fix should be done for this. [1] https://lore.kernel.org/r/20190904020912.63925-26-gaoxiang25@huawei.com Fixes: 618f40ea026b ("erofs: use read_cache_page_gfp for erofs_get_meta_page") Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20190921184355.149928-1-gaoxiang25@huawei.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8a9fcbd0e8ac..fc3a8d8064f8 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -34,11 +34,15 @@ static void erofs_readendio(struct bio *bio) struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { - struct inode *const bd_inode = sb->s_bdev->bd_inode; - struct address_space *const mapping = bd_inode->i_mapping; + struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *page; - return read_cache_page_gfp(mapping, blkaddr, + page = read_cache_page_gfp(mapping, blkaddr, mapping_gfp_constraint(mapping, ~__GFP_FS)); + /* should already be PageUptodate */ + if (!IS_ERR(page)) + lock_page(page); + return page; } static int erofs_map_blocks_flatmode(struct inode *inode, From dc76ea8c1087b5c44235566ed4be2202d21a8504 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 22 Sep 2019 18:04:34 +0800 Subject: [PATCH 147/312] erofs: fix mis-inplace determination related with noio chain Fix a recent cleanup patch. noio (bypass) chain is handled asynchronously against submit chain, therefore inplace I/O or pagevec cannot be applied to such pages. Add detailed comment for this as well. Fixes: 97e86a858bc3 ("staging: erofs: tidy up decompression frontend") Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20190922100434.229340-1-gaoxiang25@huawei.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 96e34c90f814..fad80c97d247 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,7 +575,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); - bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + bool tight = true; enum z_erofs_cache_alloctype cache_strategy; enum z_erofs_page_type page_type; @@ -628,8 +628,16 @@ restart_now: preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy, pagepool); - tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); hitted: + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or pagevec (should be processed in strict order.) + */ + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && + clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); From 2b6509f42dc6c96ef0b625888d4aa56d2e140330 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 27 Sep 2019 18:27:42 +0800 Subject: [PATCH 148/312] MIPS: Loongson64: Fix boot failure after dropping boot_mem_map From commit a94e4f24ec83 ("MIPS: init: Drop boot_mem_map") onwards, add_memory_region() is handled by memblock_add()/memblock_reserve() directly and all bootmem API should be converted to memblock API. Otherwise it will lead to boot failure, especially in the NUMA case because add_memory_region lose the node_id information. Fixes: a94e4f24ec836c8984f83959 ("MIPS: init: Drop boot_mem_map") Signed-off-by: Huacai Chen Signed-off-by: Jiaxun Yang [paul.burton@mips.com: - Invert node_id check to de-indent the switch statement & avoid lines over 80 characters. - Fixup commit reference in commit message.] Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-mips@vger.kernel.org Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: Huacai Chen --- arch/mips/loongson64/common/mem.c | 35 +++++++++++++------------- arch/mips/loongson64/loongson-3/numa.c | 11 +------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/arch/mips/loongson64/common/mem.c b/arch/mips/loongson64/common/mem.c index 4abb92e0fc39..4254ac4ec616 100644 --- a/arch/mips/loongson64/common/mem.c +++ b/arch/mips/loongson64/common/mem.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include @@ -64,24 +65,22 @@ void __init prom_init_memory(void) node_id = loongson_memmap->map[i].node_id; mem_type = loongson_memmap->map[i].mem_type; - if (node_id == 0) { - switch (mem_type) { - case SYSTEM_RAM_LOW: - add_memory_region(loongson_memmap->map[i].mem_start, - (u64)loongson_memmap->map[i].mem_size << 20, - BOOT_MEM_RAM); - break; - case SYSTEM_RAM_HIGH: - add_memory_region(loongson_memmap->map[i].mem_start, - (u64)loongson_memmap->map[i].mem_size << 20, - BOOT_MEM_RAM); - break; - case SYSTEM_RAM_RESERVED: - add_memory_region(loongson_memmap->map[i].mem_start, - (u64)loongson_memmap->map[i].mem_size << 20, - BOOT_MEM_RESERVED); - break; - } + if (node_id != 0) + continue; + + switch (mem_type) { + case SYSTEM_RAM_LOW: + memblock_add(loongson_memmap->map[i].mem_start, + (u64)loongson_memmap->map[i].mem_size << 20); + break; + case SYSTEM_RAM_HIGH: + memblock_add(loongson_memmap->map[i].mem_start, + (u64)loongson_memmap->map[i].mem_size << 20); + break; + case SYSTEM_RAM_RESERVED: + memblock_reserve(loongson_memmap->map[i].mem_start, + (u64)loongson_memmap->map[i].mem_size << 20); + break; } } } diff --git a/arch/mips/loongson64/loongson-3/numa.c b/arch/mips/loongson64/loongson-3/numa.c index 414e97de5dc0..8f20d2cb3767 100644 --- a/arch/mips/loongson64/loongson-3/numa.c +++ b/arch/mips/loongson64/loongson-3/numa.c @@ -142,8 +142,6 @@ static void __init szmem(unsigned int node) (u32)node_id, mem_type, mem_start, mem_size); pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); - add_memory_region((node_id << 44) + mem_start, - (u64)mem_size << 20, BOOT_MEM_RAM); memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), node); break; @@ -156,16 +154,12 @@ static void __init szmem(unsigned int node) (u32)node_id, mem_type, mem_start, mem_size); pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); - add_memory_region((node_id << 44) + mem_start, - (u64)mem_size << 20, BOOT_MEM_RAM); memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), node); break; case SYSTEM_RAM_RESERVED: pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n", (u32)node_id, mem_type, mem_start, mem_size); - add_memory_region((node_id << 44) + mem_start, - (u64)mem_size << 20, BOOT_MEM_RESERVED); memblock_reserve(((node_id << 44) + mem_start), mem_size << 20); break; @@ -191,8 +185,6 @@ static void __init node_mem_init(unsigned int node) NODE_DATA(node)->node_start_pfn = start_pfn; NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; - free_bootmem_with_active_regions(node, end_pfn); - if (node == 0) { /* kernel end address */ unsigned long kernel_end_pfn = PFN_UP(__pa_symbol(&_end)); @@ -209,8 +201,6 @@ static void __init node_mem_init(unsigned int node) memblock_reserve((node_addrspace_offset | 0xfe000000), 32 << 20); } - - sparse_memory_present_with_active_regions(node); } static __init void prom_meminit(void) @@ -227,6 +217,7 @@ static __init void prom_meminit(void) cpumask_clear(&__node_data[(node)]->cpumask); } } + memblocks_present(); max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); for (cpu = 0; cpu < loongson_sysconf.nr_cpus; cpu++) { From 0889d07f3e4b171c453b2aaf2b257f9074cdf624 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 30 Sep 2019 11:39:52 +0200 Subject: [PATCH 149/312] MIPS: dts: ar9331: fix interrupt-controller size It is two registers each of 4 byte. Signed-off-by: Oleksij Rempel Signed-off-by: Paul Burton Cc: Rob Herring Cc: Mark Rutland Cc: Pengutronix Kernel Team Cc: Ralf Baechle Cc: James Hogan Cc: devicetree@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/boot/dts/qca/ar9331.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/dts/qca/ar9331.dtsi b/arch/mips/boot/dts/qca/ar9331.dtsi index 63a9f33aa43e..5cfc9d347826 100644 --- a/arch/mips/boot/dts/qca/ar9331.dtsi +++ b/arch/mips/boot/dts/qca/ar9331.dtsi @@ -99,7 +99,7 @@ miscintc: interrupt-controller@18060010 { compatible = "qca,ar7240-misc-intc"; - reg = <0x18060010 0x4>; + reg = <0x18060010 0x8>; interrupt-parent = <&cpuintc>; interrupts = <6>; From 8c7138b33e5c690c308b2a7085f6313fdcb3f616 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 27 Sep 2019 16:00:31 -0700 Subject: [PATCH 150/312] net: Unpublish sk from sk_reuseport_cb before call_rcu The "reuse->sock[]" array is shared by multiple sockets. The going away sk must unpublish itself from "reuse->sock[]" before making call_rcu() call. However, this unpublish-action is currently done after a grace period and it may cause use-after-free. The fix is to move reuseport_detach_sock() to sk_destruct(). Due to the above reason, any socket with sk_reuseport_cb has to go through the rcu grace period before freeing it. It is a rather old bug (~3 yrs). The Fixes tag is not necessary the right commit but it is the one that introduced the SOCK_RCU_FREE logic and this fix is depending on it. Fixes: a4298e4522d6 ("net: add SOCK_RCU_FREE socket flag") Cc: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/core/sock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 07863edbe6fc..e23ec80a67bf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1700,8 +1700,6 @@ static void __sk_destruct(struct rcu_head *head) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -1728,7 +1726,14 @@ static void __sk_destruct(struct rcu_head *head) void sk_destruct(struct sock *sk) { - if (sock_flag(sk, SOCK_RCU_FREE)) + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + + if (rcu_access_pointer(sk->sk_reuseport_cb)) { + reuseport_detach_sock(sk); + use_call_rcu = true; + } + + if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); From 13dc8c029cabf52ba95f60c56eb104d4d95d5889 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 21 Sep 2019 15:49:54 +0900 Subject: [PATCH 151/312] kbuild: remove ar-option and KBUILD_ARFLAGS Commit 40df759e2b9e ("kbuild: Fix build with binutils <= 2.19") introduced ar-option and KBUILD_ARFLAGS to deal with old binutils. According to Documentation/process/changes.rst, the current minimal supported version of binutils is 2.21 so you can assume the 'D' option is always supported. Not only GNU ar but also llvm-ar supports it. With the 'D' option hard-coded, there is no more user of ar-option or KBUILD_ARFLAGS. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers --- Documentation/kbuild/makefiles.rst | 5 ----- Makefile | 4 ---- arch/powerpc/boot/Makefile | 2 +- scripts/Kbuild.include | 5 ----- scripts/Makefile.build | 2 +- scripts/Makefile.lib | 2 +- 6 files changed, 3 insertions(+), 17 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 6ba9d5365ff3..b89c88168d6a 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -954,11 +954,6 @@ When kbuild executes, the following steps are followed (roughly): From commandline LDFLAGS_MODULE shall be used (see kbuild.txt). - KBUILD_ARFLAGS Options for $(AR) when creating archives - - $(KBUILD_ARFLAGS) set by the top level Makefile to "D" (deterministic - mode) if this option is supported by $(AR). - KBUILD_LDS The linker script with full path. Assigned by the top-level Makefile. diff --git a/Makefile b/Makefile index 6f54f2f95743..7452174f5b21 100644 --- a/Makefile +++ b/Makefile @@ -498,7 +498,6 @@ export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL -export KBUILD_ARFLAGS # Files to ignore in find ... statements @@ -914,9 +913,6 @@ ifdef CONFIG_RETPOLINE KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) endif -# use the deterministic mode of AR if available -KBUILD_ARFLAGS := $(call ar-option,D) - include scripts/Makefile.kasan include scripts/Makefile.extrawarn include scripts/Makefile.ubsan diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 6841bd52738b..dfbd7f22eef5 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -50,7 +50,7 @@ endif BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -nostdinc -BOOTARFLAGS := -cr$(KBUILD_ARFLAGS) +BOOTARFLAGS := -crD ifdef CONFIG_CC_IS_CLANG BOOTCFLAGS += $(CLANG_FLAGS) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 4b0432e095ae..10ba926ae292 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -143,11 +143,6 @@ cc-ifversion = $(shell [ $(CONFIG_GCC_VERSION)0 $(1) $(2)000 ] && echo $(3) || e # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y) ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3)) -# ar-option -# Usage: KBUILD_ARFLAGS := $(call ar-option,D) -# Important: no spaces around options -ar-option = $(call try-run, $(AR) rc$(1) "$$TMP",$(1),$(2)) - # ld-version # Note this is mainly for HJ Lu's 3 number binutil versions ld-version = $(shell $(LD) --version | $(srctree)/scripts/ld-version.sh) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index f72aba64d611..a9e47953ca53 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -389,7 +389,7 @@ $(sort $(subdir-obj-y)): $(subdir-ym) ; ifdef builtin-target quiet_cmd_ar_builtin = AR $@ - cmd_ar_builtin = rm -f $@; $(AR) rcSTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) + cmd_ar_builtin = rm -f $@; $(AR) cDPrST $@ $(real-prereqs) $(builtin-target): $(real-obj-y) FORCE $(call if_changed,ar_builtin) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 4a0cdd6f5909..179d55af5852 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -232,7 +232,7 @@ quiet_cmd_ld = LD $@ # --------------------------------------------------------------------------- quiet_cmd_ar = AR $@ - cmd_ar = rm -f $@; $(AR) rcsTP$(KBUILD_ARFLAGS) $@ $(real-prereqs) + cmd_ar = rm -f $@; $(AR) cDPrsT $@ $(real-prereqs) # Objcopy # --------------------------------------------------------------------------- From b6f2494d311a19b33b19708543e7ef6dea1de459 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 01:08:17 +0300 Subject: [PATCH 152/312] net: dsa: sja1105: Ensure PTP time for rxtstamp reconstruction is not in the past Sometimes the PTP synchronization on the switch 'jumps': ptp4l[11241.155]: rms 8 max 16 freq -21732 +/- 11 delay 742 +/- 0 ptp4l[11243.157]: rms 7 max 17 freq -21731 +/- 10 delay 744 +/- 0 ptp4l[11245.160]: rms 33592410 max 134217731 freq +192422 +/- 8530253 delay 743 +/- 0 ptp4l[11247.163]: rms 811631 max 964131 freq +10326 +/- 557785 delay 743 +/- 0 ptp4l[11249.166]: rms 261936 max 533876 freq -304323 +/- 126371 delay 744 +/- 0 ptp4l[11251.169]: rms 48700 max 57740 freq -20218 +/- 30532 delay 744 +/- 0 ptp4l[11253.171]: rms 14570 max 30163 freq -5568 +/- 7563 delay 742 +/- 0 ptp4l[11255.174]: rms 2914 max 3440 freq -22001 +/- 1667 delay 744 +/- 1 ptp4l[11257.177]: rms 811 max 1710 freq -22653 +/- 451 delay 744 +/- 1 ptp4l[11259.180]: rms 177 max 218 freq -21695 +/- 89 delay 741 +/- 0 ptp4l[11261.182]: rms 45 max 92 freq -21677 +/- 32 delay 742 +/- 0 ptp4l[11263.186]: rms 14 max 32 freq -21733 +/- 11 delay 742 +/- 0 ptp4l[11265.188]: rms 9 max 14 freq -21725 +/- 12 delay 742 +/- 0 ptp4l[11267.191]: rms 9 max 16 freq -21727 +/- 13 delay 742 +/- 0 ptp4l[11269.194]: rms 6 max 15 freq -21726 +/- 9 delay 743 +/- 0 ptp4l[11271.197]: rms 8 max 15 freq -21728 +/- 11 delay 743 +/- 0 ptp4l[11273.200]: rms 6 max 12 freq -21727 +/- 8 delay 743 +/- 0 ptp4l[11275.202]: rms 9 max 17 freq -21720 +/- 11 delay 742 +/- 0 ptp4l[11277.205]: rms 9 max 18 freq -21725 +/- 12 delay 742 +/- 0 Background: the switch only offers partial RX timestamps (24 bits) and it is up to the driver to read the PTP clock to fill those timestamps up to 64 bits. But the PTP clock readout needs to happen quickly enough (in 0.135 seconds, in fact), otherwise the PTP clock will wrap around 24 bits, condition which cannot be detected. Looking at the 'max 134217731' value on output line 3, one can see that in hex it is 0x8000003. Because the PTP clock resolution is 8 ns, that means 0x1000000 in ticks, which is exactly 2^24. So indeed this is a PTP clock wraparound, but the reason might be surprising. What is going on is that sja1105_tstamp_reconstruct(priv, now, ts) expects a "now" time that is later than the "ts" was snapshotted at. This, of course, is obvious: we read the PTP time _after_ the partial RX timestamp was received. However, the workqueue is processing frames from a skb queue and reuses the same PTP time, read once at the beginning. Normally the skb queue only contains one frame and all goes well. But when the skb queue contains two frames, the second frame that gets dequeued might have been partially timestamped by the RX MAC _after_ we had read our PTP time initially. The code was originally like that due to concerns that SPI access for PTP time readout is a slow process, and we are time-constrained anyway (aka: premature optimization). But some timing analysis reveals that the time spent until the RX timestamp is completely reconstructed is 1 order of magnitude lower than the 0.135 s deadline even under worst-case conditions. So we can afford to read the PTP time for each frame in the RX timestamping queue, which of course ensures that the full PTP time is in the partial timestamp's future. Fixes: f3097be21bf1 ("net: dsa: sja1105: Add a state machine for RX timestamping") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b9def744bcb3..f3d38ff144c4 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2005,12 +2005,12 @@ static void sja1105_rxtstamp_work(struct work_struct *work) mutex_lock(&priv->ptp_lock); - now = priv->tstamp_cc.read(&priv->tstamp_cc); - while ((skb = skb_dequeue(&data->skb_rxtstamp_queue)) != NULL) { struct skb_shared_hwtstamps *shwt = skb_hwtstamps(skb); u64 ts; + now = priv->tstamp_cc.read(&priv->tstamp_cc); + *shwt = (struct skb_shared_hwtstamps) {0}; ts = SJA1105_SKB_CB(skb)->meta_tstamp; From 7e35b42591c058b91282f95ce3b2cf0c05ffe93d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 21 Sep 2019 16:05:41 +0900 Subject: [PATCH 153/312] kbuild: remove SUBDIRS support Linux 5.3 is out. Remove the SUBDIRS support. Signed-off-by: Masahiro Yamada --- Makefile | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 7452174f5b21..779c9c9b9820 100644 --- a/Makefile +++ b/Makefile @@ -206,24 +206,8 @@ ifndef KBUILD_CHECKSRC KBUILD_CHECKSRC = 0 endif -# Use make M=dir to specify directory of external module to build -# Old syntax make ... SUBDIRS=$PWD is still supported -# Setting the environment variable KBUILD_EXTMOD take precedence -ifdef SUBDIRS - $(warning ================= WARNING ================) - $(warning 'SUBDIRS' will be removed after Linux 5.3) - $(warning ) - $(warning If you are building an individual subdirectory) - $(warning in the kernel tree, you can do like this:) - $(warning $$ make path/to/dir/you/want/to/build/) - $(warning (Do not forget the trailing slash)) - $(warning ) - $(warning If you are building an external module,) - $(warning Please use 'M=' or 'KBUILD_EXTMOD' instead) - $(warning ==========================================) - KBUILD_EXTMOD ?= $(SUBDIRS) -endif - +# Use make M=dir or set the environment variable KBUILD_EXTMOD to specify the +# directory of external module to build. Setting M= takes precedence. ifeq ("$(origin M)", "command line") KBUILD_EXTMOD := $(M) endif From 807f2105b87af20c4a582fb62111b16689f83bb8 Mon Sep 17 00:00:00 2001 From: Alex Gaynor Date: Sat, 21 Sep 2019 15:23:04 -0700 Subject: [PATCH 154/312] kbuild: correct formatting of header in kbuild module docs Minor formatting fixup. Fixes: cd238effefa2 ("docs: kbuild: convert docs to ReST and rename to *.rst") Signed-off-by: Alex Gaynor Signed-off-by: Matthew Garrett Signed-off-by: Masahiro Yamada --- Documentation/kbuild/modules.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst index d2ae799237fd..33a17121d11d 100644 --- a/Documentation/kbuild/modules.rst +++ b/Documentation/kbuild/modules.rst @@ -498,7 +498,8 @@ build. will be written containing all exported symbols that were not defined in the kernel. ---- 6.3 Symbols From Another External Module +6.3 Symbols From Another External Module +---------------------------------------- Sometimes, an external module uses exported symbols from another external module. kbuild needs to have full knowledge of From 47346e96f004eca07720e1e2b24fc7f0b0df4092 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 24 Sep 2019 21:07:40 +0900 Subject: [PATCH 155/312] modpost: fix static EXPORT_SYMBOL warnings for UML build Johannes Berg reports lots of modpost warnings on ARCH=um builds: WARNING: "rename" [vmlinux] is a static EXPORT_SYMBOL WARNING: "lseek" [vmlinux] is a static EXPORT_SYMBOL WARNING: "ftruncate64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "getuid" [vmlinux] is a static EXPORT_SYMBOL WARNING: "lseek64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "unlink" [vmlinux] is a static EXPORT_SYMBOL WARNING: "pwrite64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "close" [vmlinux] is a static EXPORT_SYMBOL WARNING: "opendir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "pread64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "syscall" [vmlinux] is a static EXPORT_SYMBOL WARNING: "readdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "readdir64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "futimes" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__lxstat" [vmlinux] is a static EXPORT_SYMBOL WARNING: "write" [vmlinux] is a static EXPORT_SYMBOL WARNING: "closedir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__xstat" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fsync" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__lxstat64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__fxstat64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "telldir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "printf" [vmlinux] is a static EXPORT_SYMBOL WARNING: "readlink" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__sprintf_chk" [vmlinux] is a static EXPORT_SYMBOL WARNING: "link" [vmlinux] is a static EXPORT_SYMBOL WARNING: "rmdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fdatasync" [vmlinux] is a static EXPORT_SYMBOL WARNING: "truncate" [vmlinux] is a static EXPORT_SYMBOL WARNING: "statfs" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__errno_location" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__xmknod" [vmlinux] is a static EXPORT_SYMBOL WARNING: "open64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "truncate64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "open" [vmlinux] is a static EXPORT_SYMBOL WARNING: "read" [vmlinux] is a static EXPORT_SYMBOL WARNING: "chown" [vmlinux] is a static EXPORT_SYMBOL WARNING: "chmod" [vmlinux] is a static EXPORT_SYMBOL WARNING: "utime" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fchmod" [vmlinux] is a static EXPORT_SYMBOL WARNING: "seekdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "ioctl" [vmlinux] is a static EXPORT_SYMBOL WARNING: "dup2" [vmlinux] is a static EXPORT_SYMBOL WARNING: "statfs64" [vmlinux] is a static EXPORT_SYMBOL WARNING: "utimes" [vmlinux] is a static EXPORT_SYMBOL WARNING: "mkdir" [vmlinux] is a static EXPORT_SYMBOL WARNING: "fchown" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__guard" [vmlinux] is a static EXPORT_SYMBOL WARNING: "symlink" [vmlinux] is a static EXPORT_SYMBOL WARNING: "access" [vmlinux] is a static EXPORT_SYMBOL WARNING: "__stack_smash_handler" [vmlinux] is a static EXPORT_SYMBOL When you run "make", the modpost is run twice; before linking vmlinux, and before building modules. All the warnings above are from the second modpost. The offending symbols are defined not in vmlinux, but in the C library. The first modpost is run against the relocatable vmlinux.o, and those warnings are nicely suppressed because the SH_UNDEF entries from the symbol table clear the ->is_static flag. The second modpost is run against the executable vmlinux (+ modules), where those symbols have been resolved, but the definitions do not exist. This commit fixes it in a straightforward way; suppress the static EXPORT_SYMBOL warnings from "vmlinux". Without this commit, we see valid warnings twice anyway. For example, ARCH=arm64 defconfig shows the following warning twice: WARNING: "HYPERVISOR_platform_op" [vmlinux] is a static EXPORT_SYMBOL_GPL So, it is reasonable to suppress the second one. Fixes: 15bfc2348d54 ("modpost: check for static EXPORT_SYMBOL* functions") Reported-by: Johannes Berg Signed-off-by: Masahiro Yamada Tested-by: Johannes Berg Tested-by: Denis Efremov --- scripts/mod/modpost.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3961941e8e7a..442d5e2ad688 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2652,15 +2652,20 @@ int main(int argc, char **argv) fatal("modpost: Section mismatches detected.\n" "Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them.\n"); for (n = 0; n < SYMBOL_HASH_SIZE; n++) { - struct symbol *s = symbolhash[n]; + struct symbol *s; + + for (s = symbolhash[n]; s; s = s->next) { + /* + * Do not check "vmlinux". This avoids the same warnings + * shown twice, and false-positives for ARCH=um. + */ + if (is_vmlinux(s->module->name) && !s->module->is_dot_o) + continue; - while (s) { if (s->is_static) warn("\"%s\" [%s] is a static %s\n", s->name, s->module->name, export_str(s->export)); - - s = s->next; } } From 68501df92d116b760777a2cfda314789f926476f Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Sun, 29 Sep 2019 01:43:39 +0300 Subject: [PATCH 156/312] net: dsa: sja1105: Prevent leaking memory In sja1105_static_config_upload, in two cases memory is leaked: when static_config_buf_prepare_for_upload fails and when sja1105_inhibit_tx fails. In both cases config_buf should be released. Fixes: 8aa9ebccae87 ("net: dsa: Introduce driver for NXP SJA1105 5-port L2 switch") Fixes: 1a4c69406cc1 ("net: dsa: sja1105: Prevent PHY jabbering during switch reset") Signed-off-by: Navid Emamdoost Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_spi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c index 84dc603138cf..58dd37ecde17 100644 --- a/drivers/net/dsa/sja1105/sja1105_spi.c +++ b/drivers/net/dsa/sja1105/sja1105_spi.c @@ -409,7 +409,8 @@ int sja1105_static_config_upload(struct sja1105_private *priv) rc = static_config_buf_prepare_for_upload(priv, config_buf, buf_len); if (rc < 0) { dev_err(dev, "Invalid config, cannot upload\n"); - return -EINVAL; + rc = -EINVAL; + goto out; } /* Prevent PHY jabbering during switch reset by inhibiting * Tx on all ports and waiting for current packet to drain. @@ -418,7 +419,8 @@ int sja1105_static_config_upload(struct sja1105_private *priv) rc = sja1105_inhibit_tx(priv, port_bitmap, true); if (rc < 0) { dev_err(dev, "Failed to inhibit Tx on ports\n"); - return -ENXIO; + rc = -ENXIO; + goto out; } /* Wait for an eventual egress packet to finish transmission * (reach IFG). It is guaranteed that a second one will not From 68ce6688a5baefde30914fc07fc27292dbbe8320 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:01:39 +0300 Subject: [PATCH 157/312] net: sched: taprio: Fix potential integer overflow in taprio_set_picos_per_byte The speed divisor is used in a context expecting an s64, but it is evaluated using 32-bit arithmetic. To avoid that happening, instead of multiplying by 1,000,000 in the first place, simplify the fraction and do a standard 32 bit division instead. Fixes: f04b514c0ce2 ("taprio: Set default link speed to 10 Mbps in taprio_set_picos_per_byte") Reported-by: Gustavo A. R. Silva Suggested-by: Eric Dumazet Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2f7b34205c82..2aab46ada94f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1048,8 +1048,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev, speed = ecmd.base.speed; skip: - picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - speed * 1000 * 1000); + picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", From df359f0d09dc029829b66322707a2f558cb720f7 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Fri, 27 Sep 2019 16:49:20 +0100 Subject: [PATCH 158/312] xen/efi: Set nonblocking callbacks Other parts of the kernel expect these nonblocking EFI callbacks to exist and crash when running under Xen. Since the implementations of xen_efi_set_variable() and xen_efi_query_variable_info() do not take any locks, use them for the nonblocking callbacks too. Signed-off-by: Ross Lagerwall Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- arch/arm/xen/efi.c | 2 ++ arch/x86/xen/efi.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm/xen/efi.c b/arch/arm/xen/efi.c index d687a73044bf..cb2aaf98e243 100644 --- a/arch/arm/xen/efi.c +++ b/arch/arm/xen/efi.c @@ -19,7 +19,9 @@ void __init xen_efi_runtime_setup(void) efi.get_variable = xen_efi_get_variable; efi.get_next_variable = xen_efi_get_next_variable; efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; efi.update_capsule = xen_efi_update_capsule; efi.query_capsule_caps = xen_efi_query_capsule_caps; efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 0d3365cb64de..7e3eb70f411a 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -65,7 +65,9 @@ static efi_system_table_t __init *xen_efi_probe(void) efi.get_variable = xen_efi_get_variable; efi.get_next_variable = xen_efi_get_next_variable; efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; efi.update_capsule = xen_efi_update_capsule; efi.query_capsule_caps = xen_efi_query_capsule_caps; efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; From 851345828095f1e40d383ec1a1e91767a2051976 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 23 Sep 2019 07:56:47 +0200 Subject: [PATCH 159/312] s390/cpumf: Use consistant debug print format Use consistant debug print format of the form variable blank value. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_cf_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index 5f1fd1581330..2654e348801a 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -390,7 +390,7 @@ static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, debug_sprintf_event(cf_diag_dbg, 6, "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc:%d\n", + " need %zd rc %d\n", __func__, ctrset, ctrset_size, cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); return need; @@ -567,7 +567,7 @@ static int cf_diag_add(struct perf_event *event, int flags) int err = 0; debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x cpuhw:%p\n", + "%s event %p cpu %d flags %#x cpuhw %p\n", __func__, event, event->cpu, flags, cpuhw); if (cpuhw->flags & PMU_F_IN_USE) { From 932bfc5aae08f3cb20c1c9f051542f5933710151 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 20 Sep 2019 11:57:43 +0200 Subject: [PATCH 160/312] s390/cpumsf: Check for CPU Measurement sampling s390 IBM z15 introduces a check if the CPU Mesurement Facility sampling is temporarily unavailable. If this is the case return -EBUSY and abort the setup of CPU Measuement facility sampling. Signed-off-by: Thomas Richter Reviewed-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cpu_mf.h | 5 ++++- arch/s390/kernel/perf_cpum_sf.c | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index ceeb552d3472..753669bfbe99 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -28,6 +28,8 @@ asm(".include \"asm/cpu_mf-insn.h\"\n"); CPU_MF_INT_SF_PRA|CPU_MF_INT_SF_SACA| \ CPU_MF_INT_SF_LSDA) +#define CPU_MF_SF_RIBM_NOTAV 0x1 /* Sampling unavailable */ + /* CPU measurement facility support */ static inline int cpum_cf_avail(void) { @@ -69,7 +71,8 @@ struct hws_qsi_info_block { /* Bit(s) */ unsigned long max_sampl_rate; /* 16-23: maximum sampling interval*/ unsigned long tear; /* 24-31: TEAR contents */ unsigned long dear; /* 32-39: DEAR contents */ - unsigned int rsvrd0; /* 40-43: reserved */ + unsigned int rsvrd0:24; /* 40-42: reserved */ + unsigned int ribm:8; /* 43: Reserved by IBM */ unsigned int cpu_speed; /* 44-47: CPU speed */ unsigned long long rsvrd1; /* 48-55: reserved */ unsigned long long rsvrd2; /* 56-63: reserved */ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 544a02e944c6..2421f0b62630 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -803,6 +803,12 @@ static int __hw_perf_event_init(struct perf_event *event) goto out; } + if (si.ribm & CPU_MF_SF_RIBM_NOTAV) { + pr_warn("CPU Measurement Facility sampling is temporarily not available\n"); + err = -EBUSY; + goto out; + } + /* Always enable basic sampling */ SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE; From e14e59c125f6d81c08f2c598eb17ada3fc3ba21f Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 24 Sep 2019 09:31:13 +0200 Subject: [PATCH 161/312] s390/cpumf: Fix indentation in sampling device driver Fix indentation in the s390 CPU Measuement Facility sampling device dirver. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_sf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 2421f0b62630..3d8b12a9a6ff 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -901,7 +901,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event) /* Check online status of the CPU to which the event is pinned */ if (event->cpu >= 0 && !cpu_online(event->cpu)) - return -ENODEV; + return -ENODEV; /* Force reset of idle/hv excludes regardless of what the * user requested. From 9f494438d4bc4746831cb5289b9836c18e4bbf96 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Fri, 20 Sep 2019 10:17:09 +0200 Subject: [PATCH 162/312] s390/qdio: clarify size of the QIB parm area The QIB parm area is 128 bytes long. Current code consistently misuses an _entirely unrelated_ QDIO constant, merely because it has the same value. Stop doing so. Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Reviewed-by: Jens Remus Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/qdio.h | 2 +- drivers/s390/cio/qdio_setup.c | 2 +- drivers/s390/net/qeth_core_main.c | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 78e8a888306d..e3f238e8c611 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -111,7 +111,7 @@ struct qib { /* private: */ u8 res[88]; /* public: */ - u8 parm[QDIO_MAX_BUFFERS_PER_Q]; + u8 parm[128]; } __attribute__ ((packed, aligned(256))); /** diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index f4ca1d29d61b..cd164886132f 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -113,7 +113,7 @@ static void set_impl_params(struct qdio_irq *irq_ptr, irq_ptr->qib.pfmt = qib_param_field_format; if (qib_param_field) memcpy(irq_ptr->qib.parm, qib_param_field, - QDIO_MAX_BUFFERS_PER_Q); + sizeof(irq_ptr->qib.parm)); if (!input_slib_elements) goto output; diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index a7868c8133ee..dda274351c21 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -4715,8 +4715,7 @@ static int qeth_qdio_establish(struct qeth_card *card) QETH_CARD_TEXT(card, 2, "qdioest"); - qib_param_field = kzalloc(QDIO_MAX_BUFFERS_PER_Q, - GFP_KERNEL); + qib_param_field = kzalloc(FIELD_SIZEOF(struct qib, parm), GFP_KERNEL); if (!qib_param_field) { rc = -ENOMEM; goto out_free_nothing; From e1fba49cc1e965a3dacd897367ba1e7b340cf0f4 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 30 Sep 2019 16:38:54 -0700 Subject: [PATCH 163/312] kvm: vmx: Limit guest PMCs to those supported on the host KVM can only virtualize as many PMCs as the host supports. Limit the number of generic counters and fixed counters to the number of corresponding counters supported on the host, rather than to INTEL_PMC_MAX_GENERIC and INTEL_PMC_MAX_FIXED, respectively. Note that INTEL_PMC_MAX_GENERIC is currently 32, which exceeds the 18 contiguous MSR indices reserved by Intel for event selectors. Since the existing code relies on a contiguous range of MSR indices for event selectors, it can't possibly work for more than 18 general purpose counters. Fixes: f5132b01386b5a ("KVM: Expose a version 2 architectural PMU to a guests") Signed-off-by: Jim Mattson Reviewed-by: Marc Orr Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0e7e39..3e9c059099e9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -262,6 +262,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -283,8 +284,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + perf_get_x86_pmu_capability(&x86_pmu); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -294,7 +297,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } else { pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } From f5a1a536fa14895ccff4e94e6a5af90901ce86aa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:52 +1000 Subject: [PATCH 164/312] lib: introduce copy_struct_from_user() helper A common pattern for syscall extensions is increasing the size of a struct passed from userspace, such that the zero-value of the new fields result in the old kernel behaviour (allowing for a mix of userspace and kernel vintages to operate on one another in most cases). While this interface exists for communication in both directions, only one interface is straightforward to have reasonable semantics for (userspace passing a struct to the kernel). For kernel returns to userspace, what the correct semantics are (whether there should be an error if userspace is unaware of a new extension) is very syscall-dependent and thus probably cannot be unified between syscalls (a good example of this problem is [1]). Previously there was no common lib/ function that implemented the necessary extension-checking semantics (and different syscalls implemented them slightly differently or incompletely[2]). Future patches replace common uses of this pattern to make use of copy_struct_from_user(). Some in-kernel selftests that insure that the handling of alignment and various byte patterns are all handled identically to memchr_inv() usage. [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do similar checks to copy_struct_from_user() while rt_sigprocmask(2) always rejects differently-sized struct arguments. Suggested-by: Rasmus Villemoes Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191001011055.19283-2-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/linux/bitops.h | 7 +++ include/linux/uaccess.h | 70 +++++++++++++++++++++ lib/strnlen_user.c | 8 +-- lib/test_user_copy.c | 136 ++++++++++++++++++++++++++++++++++++++-- lib/usercopy.c | 55 ++++++++++++++++ 5 files changed, 263 insertions(+), 13 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf074bce3eb3..c94a9ff9f082 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,6 +4,13 @@ #include #include +/* Set bits in the first 'n' bytes when loaded from memory */ +#ifdef __LITTLE_ENDIAN +# define aligned_byte_mask(n) ((1UL << 8*(n))-1) +#else +# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) +#endif + #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 70bbdc38dc37..e47d0522a1f4 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -231,6 +231,76 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, #endif /* ARCH_HAS_NOCACHE_UACCESS */ +extern __must_check int check_zeroed_user(const void __user *from, size_t size); + +/** + * copy_struct_from_user: copy a struct from userspace + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @src: Source address, in userspace. + * @usize: (Alleged) size of @src struct. + * + * Copies a struct from userspace to kernel space, in a way that guarantees + * backwards-compatibility for struct syscall arguments (as long as future + * struct extensions are made such that all new fields are *appended* to the + * old struct, and zeroed-out new fields have the same meaning as the old + * struct). + * + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. + * The recommended usage is something like the following: + * + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) + * { + * int err; + * struct foo karg = {}; + * + * if (usize > PAGE_SIZE) + * return -E2BIG; + * if (usize < FOO_SIZE_VER0) + * return -EINVAL; + * + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + * if (err) + * return err; + * + * // ... + * } + * + * There are three cases to consider: + * * If @usize == @ksize, then it's copied verbatim. + * * If @usize < @ksize, then the userspace has passed an old struct to a + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) + * are to be zero-filled. + * * If @usize > @ksize, then the userspace has passed a new struct to an + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) + * are checked to ensure they are zeroed, otherwise -E2BIG is returned. + * + * Returns (in all cases, some data may have been copied): + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. + * * -EFAULT: access to userspace failed. + */ +static __always_inline __must_check int +copy_struct_from_user(void *dst, size_t ksize, const void __user *src, + size_t usize) +{ + size_t size = min(ksize, usize); + size_t rest = max(ksize, usize) - size; + + /* Deal with trailing bytes. */ + if (usize < ksize) { + memset(dst + size, 0, rest); + } else if (usize > ksize) { + int ret = check_zeroed_user(src + size, rest); + if (ret <= 0) + return ret ?: -E2BIG; + } + /* Copy the interoperable parts of the struct. */ + if (copy_from_user(dst, src, size)) + return -EFAULT; + return 0; +} + /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 28ff554a1be8..6c0005d5dd5c 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -3,16 +3,10 @@ #include #include #include +#include #include -/* Set bits in the first 'n' bytes when loaded from memory */ -#ifdef __LITTLE_ENDIAN -# define aligned_byte_mask(n) ((1ul << 8*(n))-1) -#else -# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n))) -#endif - /* * Do a strnlen, return length of string *with* final '\0'. * 'count' is the user-supplied count, while 'max' is the diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 67bcd5dfd847..950ee88cd6ac 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -31,14 +31,133 @@ # define TEST_U64 #endif -#define test(condition, msg) \ -({ \ - int cond = (condition); \ - if (cond) \ - pr_warn("%s\n", msg); \ - cond; \ +#define test(condition, msg, ...) \ +({ \ + int cond = (condition); \ + if (cond) \ + pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \ + cond; \ }) +static bool is_zeroed(void *from, size_t size) +{ + return memchr_inv(from, 0x0, size) == NULL; +} + +static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size) +{ + int ret = 0; + size_t start, end, i; + size_t zero_start = size / 4; + size_t zero_end = size - zero_start; + + /* + * We conduct a series of check_nonzero_user() tests on a block of memory + * with the following byte-pattern (trying every possible [start,end] + * pair): + * + * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ] + * + * And we verify that check_nonzero_user() acts identically to memchr_inv(). + */ + + memset(kmem, 0x0, size); + for (i = 1; i < zero_start; i += 2) + kmem[i] = 0xff; + for (i = zero_end; i < size; i += 2) + kmem[i] = 0xff; + + ret |= test(copy_to_user(umem, kmem, size), + "legitimate copy_to_user failed"); + + for (start = 0; start <= size; start++) { + for (end = start; end <= size; end++) { + size_t len = end - start; + int retval = check_zeroed_user(umem + start, len); + int expected = is_zeroed(kmem + start, len); + + ret |= test(retval != expected, + "check_nonzero_user(=%d) != memchr_inv(=%d) mismatch (start=%zu, end=%zu)", + retval, expected, start, end); + } + } + + return ret; +} + +static int test_copy_struct_from_user(char *kmem, char __user *umem, + size_t size) +{ + int ret = 0; + char *umem_src = NULL, *expected = NULL; + size_t ksize, usize; + + umem_src = kmalloc(size, GFP_KERNEL); + if (ret |= test(umem_src == NULL, "kmalloc failed")) + goto out_free; + + expected = kmalloc(size, GFP_KERNEL); + if (ret |= test(expected == NULL, "kmalloc failed")) + goto out_free; + + /* Fill umem with a fixed byte pattern. */ + memset(umem_src, 0x3e, size); + ret |= test(copy_to_user(umem, umem_src, size), + "legitimate copy_to_user failed"); + + /* Check basic case -- (usize == ksize). */ + ksize = size; + usize = size; + + memcpy(expected, umem_src, ksize); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize == ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize == ksize) gives unexpected copy"); + + /* Old userspace case -- (usize < ksize). */ + ksize = size; + usize = size / 2; + + memcpy(expected, umem_src, usize); + memset(expected + usize, 0x0, ksize - usize); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize < ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize < ksize) gives unexpected copy"); + + /* New userspace (-E2BIG) case -- (usize > ksize). */ + ksize = size / 2; + usize = size; + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize) != -E2BIG, + "copy_struct_from_user(usize > ksize) didn't give E2BIG"); + + /* New userspace (success) case -- (usize > ksize). */ + ksize = size / 2; + usize = size; + + memcpy(expected, umem_src, ksize); + ret |= test(clear_user(umem + ksize, usize - ksize), + "legitimate clear_user failed"); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize > ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize > ksize) gives unexpected copy"); + +out_free: + kfree(expected); + kfree(umem_src); + return ret; +} + static int __init test_user_copy_init(void) { int ret = 0; @@ -106,6 +225,11 @@ static int __init test_user_copy_init(void) #endif #undef test_legit + /* Test usage of check_nonzero_user(). */ + ret |= test_check_nonzero_user(kmem, usermem, 2 * PAGE_SIZE); + /* Test usage of copy_struct_from_user(). */ + ret |= test_copy_struct_from_user(kmem, usermem, 2 * PAGE_SIZE); + /* * Invalid usage: none of these copies should succeed. */ diff --git a/lib/usercopy.c b/lib/usercopy.c index c2bfbcaeb3dc..cbb4d9ec00f2 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include /* out-of-line parts */ @@ -31,3 +32,57 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) } EXPORT_SYMBOL(_copy_to_user); #endif + +/** + * check_zeroed_user: check if a userspace buffer only contains zero bytes + * @from: Source address, in userspace. + * @size: Size of buffer. + * + * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for + * userspace addresses (and is more efficient because we don't care where the + * first non-zero byte is). + * + * Returns: + * * 0: There were non-zero bytes present in the buffer. + * * 1: The buffer was full of zero bytes. + * * -EFAULT: access to userspace failed. + */ +int check_zeroed_user(const void __user *from, size_t size) +{ + unsigned long val; + uintptr_t align = (uintptr_t) from % sizeof(unsigned long); + + if (unlikely(size == 0)) + return 1; + + from -= align; + size += align; + + if (!user_access_begin(from, size)) + return -EFAULT; + + unsafe_get_user(val, (unsigned long __user *) from, err_fault); + if (align) + val &= ~aligned_byte_mask(align); + + while (size > sizeof(unsigned long)) { + if (unlikely(val)) + goto done; + + from += sizeof(unsigned long); + size -= sizeof(unsigned long); + + unsafe_get_user(val, (unsigned long __user *) from, err_fault); + } + + if (size < sizeof(unsigned long)) + val &= aligned_byte_mask(size); + +done: + user_access_end(); + return (val == 0); +err_fault: + user_access_end(); + return -EFAULT; +} +EXPORT_SYMBOL(check_zeroed_user); From f14c234b4bc5184fd40d9a47830e5b32c3b36d49 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:53 +1000 Subject: [PATCH 165/312] clone3: switch to copy_struct_from_user() Switch clone3() syscall from it's own copying struct clone_args from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Additionally, explicitly define CLONE_ARGS_SIZE_VER0 to match the other users of the struct-extension pattern. Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-3-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 2 ++ kernel/fork.c | 34 +++++++--------------------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b3105ac1381a..0945805982b4 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,6 +47,8 @@ struct clone_args { __aligned_u64 tls; }; +#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ + /* * Scheduling policies */ diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..2ef529869c64 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2525,39 +2525,19 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, - size_t size) + size_t usize) { + int err; struct clone_args args; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - - if (unlikely(size < sizeof(struct clone_args))) + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; - if (unlikely(!access_ok(uargs, size))) - return -EFAULT; - - if (size > sizeof(struct clone_args)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uargs + sizeof(struct clone_args); - end = (void __user *)uargs + size; - - for (; addr < end; addr++) { - if (get_user(val, addr)) - return -EFAULT; - if (val) - return -E2BIG; - } - - size = sizeof(struct clone_args); - } - - if (copy_from_user(&args, uargs, size)) - return -EFAULT; + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (err) + return err; /* * Verify that higher 32bits of exit_signal are unset and that From dff3a85feceade213daf153556fd32a3b0a73c63 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:54 +1000 Subject: [PATCH 166/312] sched_setattr: switch to copy_struct_from_user() Switch sched_setattr() syscall from it's own copying struct sched_attr from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Ideally we could also unify sched_getattr(2)-style syscalls as well, but unfortunately the correct semantics for such syscalls are much less clear (see [1] for more detail). In future we could come up with a more sane idea for how the syscall interface should look. [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-4-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/sched/core.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7880f4f64d0e..dd05a378631a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5106,9 +5106,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -5116,45 +5113,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -5354,7 +5325,7 @@ sched_attr_copy_to_user(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, From c2ba8f41ad366dc12840dd87d8f1565185845126 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:55 +1000 Subject: [PATCH 167/312] perf_event_open: switch to copy_struct_from_user() Switch perf_event_open() syscall from it's own copying struct perf_event_attr from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-5-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/events/core.c | 47 +++++++++----------------------------------- 1 file changed, 9 insertions(+), 38 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4655adbbae10..3f0cb82e4fbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10586,55 +10586,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - attr->size = size; if (attr->__reserved_1) From 85560117d00f5d528e928918b8f61cadcefff98b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 4 Sep 2019 21:49:01 +0200 Subject: [PATCH 168/312] loop: change queue block size to match when using DIO The loop driver assumes that if the passed in fd is opened with O_DIRECT, the caller wants to use direct I/O on the loop device. However, if the underlying block device has a different block size than the loop block queue, direct I/O can't be enabled. Instead of requiring userspace to manually change the blocksize and re-enable direct I/O, just change the queue block sizes to match, as well as the io_min size. Reviewed-by: Christoph Hellwig Signed-off-by: Martijn Coenen Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1410fa893653..f6f77eaa7217 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -994,6 +994,16 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); + if (io_is_direct(lo->lo_backing_file) && inode->i_sb->s_bdev) { + /* In case of direct I/O, match underlying block size */ + unsigned short bsize = bdev_logical_block_size( + inode->i_sb->s_bdev); + + blk_queue_logical_block_size(lo->lo_queue, bsize); + blk_queue_physical_block_size(lo->lo_queue, bsize); + blk_queue_io_min(lo->lo_queue, bsize); + } + loop_update_rotational(lo); loop_update_dio(lo); set_capacity(lo->lo_disk, size); From bdf200731145f07a6127cb16753e2e8fdc159cf4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 1 Oct 2019 09:53:29 -0600 Subject: [PATCH 169/312] io_uring: use __kernel_timespec in timeout ABI All system calls use struct __kernel_timespec instead of the old struct timespec, but this one was just added with the old-style ABI. Change it now to enforce the use of __kernel_timespec, avoiding ABI confusion and the need for compat handlers on 32-bit architectures. Any user space caller will have to use __kernel_timespec now, but this is unambiguous and works for any C library regardless of the time_t definition. A nicer way to specify the timeout would have been a less ambiguous 64-bit nanosecond value, but I suppose it's too late now to change that as this would impact both 32-bit and 64-bit users. Fixes: 5262f567987d ("io_uring: IORING_OP_TIMEOUT support") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dd094b387cab..0bc167aca46d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1892,15 +1892,15 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned count, req_dist, tail_index; struct io_ring_ctx *ctx = req->ctx; struct list_head *entry; - struct timespec ts; + struct timespec64 ts; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || sqe->len != 1) return -EINVAL; - if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr, - sizeof(ts))) + + if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; /* @@ -1934,7 +1934,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), HRTIMER_MODE_REL); return 0; } From dd45483981ac62f432e073fea6e5e11200b9070d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Tue, 1 Oct 2019 17:34:38 +0200 Subject: [PATCH 170/312] s390/dasd: Fix error handling during online processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible that the CCW commands for reading volume and extent pool information are not supported, either by the storage server (for dedicated DASDs) or by z/VM (for virtual devices, such as MDISKs). As a command reject will occur in such a case, the current error handling leads to a failing online processing and thus the DASD can't be used at all. Since the data being read is not essential for an fully operational DASD, the error handling can be removed. Information about the failing command is sent to the s390dbf debug feature. Fixes: c729696bcf8b ("s390/dasd: Recognise data for ESE volumes") Cc: # 5.3 Reported-by: Frank Heimes Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index fc53e1e221f0..b213c40d1a51 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1553,8 +1553,8 @@ static int dasd_eckd_read_vol_info(struct dasd_device *device) if (rc == 0) { memcpy(&private->vsq, vsq, sizeof(*vsq)); } else { - dev_warn(&device->cdev->dev, - "Reading the volume storage information failed with rc=%d\n", rc); + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, + "Reading the volume storage information failed with rc=%d", rc); } if (useglobal) @@ -1737,8 +1737,8 @@ static int dasd_eckd_read_ext_pool_info(struct dasd_device *device) if (rc == 0) { dasd_eckd_cpy_ext_pool_data(device, lcq); } else { - dev_warn(&device->cdev->dev, - "Reading the logical configuration failed with rc=%d\n", rc); + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, + "Reading the logical configuration failed with rc=%d", rc); } dasd_sfree_request(cqr, cqr->memdev); @@ -2020,14 +2020,10 @@ dasd_eckd_check_characteristics(struct dasd_device *device) dasd_eckd_read_features(device); /* Read Volume Information */ - rc = dasd_eckd_read_vol_info(device); - if (rc) - goto out_err3; + dasd_eckd_read_vol_info(device); /* Read Extent Pool Information */ - rc = dasd_eckd_read_ext_pool_info(device); - if (rc) - goto out_err3; + dasd_eckd_read_ext_pool_info(device); /* Read Device Characteristics */ rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, @@ -5663,14 +5659,10 @@ static int dasd_eckd_restore_device(struct dasd_device *device) dasd_eckd_read_features(device); /* Read Volume Information */ - rc = dasd_eckd_read_vol_info(device); - if (rc) - goto out_err2; + dasd_eckd_read_vol_info(device); /* Read Extent Pool Information */ - rc = dasd_eckd_read_ext_pool_info(device); - if (rc) - goto out_err2; + dasd_eckd_read_ext_pool_info(device); /* Read Device Characteristics */ rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC, From 964ce509e2ded52c1a61ad86044cc4d70abd9eb8 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 1 Oct 2019 17:34:39 +0200 Subject: [PATCH 171/312] Revert "s390/dasd: Add discard support for ESE volumes" This reverts commit 7e64db1597fe114b83fe17d0ba96c6aa5fca419a. The thin provisioning feature introduces an IOCTL and the discard support to allow userspace tools and filesystems to release unused and previously allocated space respectively. During some internal performance improvements and further tests, the release of allocated space revealed some issues that may lead to data corruption in some configurations when filesystems are mounted with discard support enabled. While we're working on a fix and trying to clarify the situation, this commit reverts the discard support for ESE volumes to prevent potential data corruption. Cc: # 5.3 Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 57 ++-------------------------------- 1 file changed, 3 insertions(+), 54 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index b213c40d1a51..c94184d080f8 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2055,9 +2055,6 @@ dasd_eckd_check_characteristics(struct dasd_device *device) if (readonly) set_bit(DASD_FLAG_DEVICE_RO, &device->flags); - if (dasd_eckd_is_ese(device)) - dasd_set_feature(device->cdev, DASD_FEATURE_DISCARD, 1); - dev_info(&device->cdev->dev, "New DASD %04X/%02X (CU %04X/%02X) " "with %d cylinders, %d heads, %d sectors%s\n", private->rdc_data.dev_type, @@ -3691,14 +3688,6 @@ static int dasd_eckd_release_space(struct dasd_device *device, return -EINVAL; } -static struct dasd_ccw_req * -dasd_eckd_build_cp_discard(struct dasd_device *device, struct dasd_block *block, - struct request *req, sector_t first_trk, - sector_t last_trk) -{ - return dasd_eckd_dso_ras(device, block, req, first_trk, last_trk, 1); -} - static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( struct dasd_device *startdev, struct dasd_block *block, @@ -4443,10 +4432,6 @@ static struct dasd_ccw_req *dasd_eckd_build_cp(struct dasd_device *startdev, cmdwtd = private->features.feature[12] & 0x40; use_prefix = private->features.feature[8] & 0x01; - if (req_op(req) == REQ_OP_DISCARD) - return dasd_eckd_build_cp_discard(startdev, block, req, - first_trk, last_trk); - cqr = NULL; if (cdlspecial || dasd_page_cache) { /* do nothing, just fall through to the cmd mode single case */ @@ -4725,14 +4710,12 @@ static struct dasd_ccw_req *dasd_eckd_build_alias_cp(struct dasd_device *base, struct dasd_block *block, struct request *req) { - struct dasd_device *startdev = NULL; struct dasd_eckd_private *private; - struct dasd_ccw_req *cqr; + struct dasd_device *startdev; unsigned long flags; + struct dasd_ccw_req *cqr; - /* Discard requests can only be processed on base devices */ - if (req_op(req) != REQ_OP_DISCARD) - startdev = dasd_alias_get_start_dev(base); + startdev = dasd_alias_get_start_dev(base); if (!startdev) startdev = base; private = startdev->private; @@ -6513,20 +6496,8 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block) unsigned int logical_block_size = block->bp_block; struct request_queue *q = block->request_queue; struct dasd_device *device = block->base; - struct dasd_eckd_private *private; - unsigned int max_discard_sectors; - unsigned int max_bytes; - unsigned int ext_bytes; /* Extent Size in Bytes */ - int recs_per_trk; - int trks_per_cyl; - int ext_limit; - int ext_size; /* Extent Size in Cylinders */ int max; - private = device->private; - trks_per_cyl = private->rdc_data.trk_per_cyl; - recs_per_trk = recs_per_track(&private->rdc_data, 0, logical_block_size); - if (device->features & DASD_FEATURE_USERAW) { /* * the max_blocks value for raw_track access is 256 @@ -6547,28 +6518,6 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block) /* With page sized segments each segment can be translated into one idaw/tidaw */ blk_queue_max_segment_size(q, PAGE_SIZE); blk_queue_segment_boundary(q, PAGE_SIZE - 1); - - if (dasd_eckd_is_ese(device)) { - /* - * Depending on the extent size, up to UINT_MAX bytes can be - * accepted. However, neither DASD_ECKD_RAS_EXTS_MAX nor the - * device limits should be exceeded. - */ - ext_size = dasd_eckd_ext_size(device); - ext_limit = min(private->real_cyl / ext_size, DASD_ECKD_RAS_EXTS_MAX); - ext_bytes = ext_size * trks_per_cyl * recs_per_trk * - logical_block_size; - max_bytes = UINT_MAX - (UINT_MAX % ext_bytes); - if (max_bytes / ext_bytes > ext_limit) - max_bytes = ext_bytes * ext_limit; - - max_discard_sectors = max_bytes / 512; - - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - q->limits.discard_granularity = ext_bytes; - q->limits.discard_alignment = ext_bytes; - } } static struct ccw_driver dasd_eckd_driver = { From f88eb7c0d002a67ef31aeb7850b42ff69abc46dc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 20 Sep 2019 21:54:17 +0200 Subject: [PATCH 172/312] nl80211: validate beacon head We currently don't validate the beacon head, i.e. the header, fixed part and elements that are to go in front of the TIM element. This means that the variable elements there can be malformed, e.g. have a length exceeding the buffer size, but most downstream code from this assumes that this has already been checked. Add the necessary checks to the netlink policy. Cc: stable@vger.kernel.org Fixes: ed1b6cc7f80f ("cfg80211/nl80211: add beacon settings") Link: https://lore.kernel.org/r/1569009255-I7ac7fbe9436e9d8733439eab8acbbd35e55c74ef@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d21b1581a665..7386421e2ad3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info) return __cfg80211_rdev_from_attrs(netns, info->attrs); } +static int validate_beacon_head(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + const struct element *elem; + const struct ieee80211_mgmt *mgmt = (void *)data; + unsigned int fixedlen = offsetof(struct ieee80211_mgmt, + u.beacon.variable); + + if (len < fixedlen) + goto err; + + if (ieee80211_hdrlen(mgmt->frame_control) != + offsetof(struct ieee80211_mgmt, u.beacon)) + goto err; + + data += fixedlen; + len -= fixedlen; + + for_each_element(elem, data, len) { + /* nothing */ + } + + if (for_each_element_completed(elem, data, len)) + return 0; + +err: + NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head"); + return -EINVAL; +} + static int validate_ie_attr(const struct nlattr *attr, struct netlink_ext_ack *extack) { @@ -338,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, - [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_BEACON_HEAD] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head, + IEEE80211_MAX_DATA_LEN), [NL80211_ATTR_BEACON_TAIL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, IEEE80211_MAX_DATA_LEN), From 242b0931c1918c56cd1dc5563fd250a3c39b996d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 20 Sep 2019 21:54:18 +0200 Subject: [PATCH 173/312] cfg80211: validate SSID/MBSSID element ordering assumption The code copying the data assumes that the SSID element is before the MBSSID element, but since the data is untrusted from the AP, this cannot be guaranteed. Validate that this is indeed the case and ignore the MBSSID otherwise, to avoid having to deal with both cases for the copy of data that should be between them. Cc: stable@vger.kernel.org Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Link: https://lore.kernel.org/r/1569009255-I1673911f5eae02964e21bdc11b2bf58e5e207e59@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d313c9befa23..ff1016607f0b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1723,7 +1723,12 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, return; new_ie_len -= trans_ssid[1]; mbssid = cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen); - if (!mbssid) + /* + * It's not valid to have the MBSSID element before SSID + * ignore if that happens - the code below assumes it is + * after (while copying things inbetween). + */ + if (!mbssid || mbssid < trans_ssid) return; new_ie_len -= mbssid[1]; rcu_read_lock(); From f43e5210c739fe76a4b0ed851559d6902f20ceb1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 23 Sep 2019 13:51:16 +0200 Subject: [PATCH 174/312] cfg80211: initialize on-stack chandefs In a few places we don't properly initialize on-stack chandefs, resulting in EDMG data to be non-zero, which broke things. Additionally, in a few places we rely on the driver to init the data completely, but perhaps we shouldn't as non-EDMG drivers may not initialize the EDMG data, also initialize it there. Cc: stable@vger.kernel.org Fixes: 2a38075cd0be ("nl80211: Add support for EDMG channels") Reported-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Link: https://lore.kernel.org/r/1569239475-I2dcce394ecf873376c386a78f31c2ec8b538fa25@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 +++- net/wireless/reg.c | 2 +- net/wireless/wext-compat.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7386421e2ad3..10617566a117 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2669,6 +2669,8 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + memset(chandef, 0, sizeof(*chandef)); + chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = control_freq; @@ -3209,7 +3211,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_channel) { int ret; - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; ret = rdev_get_channel(rdev, wdev, &chandef); if (ret == 0) { diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5311d0ae2454..420c4207ab59 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2108,7 +2108,7 @@ static void reg_call_notifier(struct wiphy *wiphy, static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) { - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 7b6529d81c61..cac9e28d852b 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -798,7 +798,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; int ret; switch (wdev->iftype) { From b501426cf86e70649c983c52f4c823b3c40d72a3 Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Thu, 26 Sep 2019 16:16:50 +0800 Subject: [PATCH 175/312] nl80211: fix null pointer dereference If the interface is not in MESH mode, the command 'iw wlanx mpath del' will cause kernel panic. The root cause is null pointer access in mpp_flush_by_proxy(), as the pointer 'sdata->u.mesh.mpp_paths' is NULL for non MESH interface. Unable to handle kernel NULL pointer dereference at virtual address 00000068 [...] PC is at _raw_spin_lock_bh+0x20/0x5c LR is at mesh_path_del+0x1c/0x17c [mac80211] [...] Process iw (pid: 4537, stack limit = 0xd83e0238) [...] [] (_raw_spin_lock_bh) from [] (mesh_path_del+0x1c/0x17c [mac80211]) [] (mesh_path_del [mac80211]) from [] (extack_doit+0x20/0x68 [compat]) [] (extack_doit [compat]) from [] (genl_rcv_msg+0x274/0x30c) [] (genl_rcv_msg) from [] (netlink_rcv_skb+0x58/0xac) [] (netlink_rcv_skb) from [] (genl_rcv+0x20/0x34) [] (genl_rcv) from [] (netlink_unicast+0x11c/0x204) [] (netlink_unicast) from [] (netlink_sendmsg+0x30c/0x370) [] (netlink_sendmsg) from [] (sock_sendmsg+0x70/0x84) [] (sock_sendmsg) from [] (___sys_sendmsg.part.3+0x188/0x228) [] (___sys_sendmsg.part.3) from [] (__sys_sendmsg+0x4c/0x70) [] (__sys_sendmsg) from [] (ret_fast_syscall+0x0/0x44) Code: e2822c02 e2822001 e5832004 f590f000 (e1902f9f) ---[ end trace bbd717600f8f884d ]--- Signed-off-by: Miaoqing Pan Link: https://lore.kernel.org/r/1569485810-761-1-git-send-email-miaoqing@codeaurora.org [trim useless data from commit message] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 10617566a117..141cdb171665 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6305,6 +6305,9 @@ static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_mpath) return -EOPNOTSUPP; + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + return rdev_del_mpath(rdev, dev, dst); } From 8ed31a264065ae92058ce54aa3cc8da8d81dc6d7 Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Fri, 27 Sep 2019 10:03:16 +0800 Subject: [PATCH 176/312] mac80211: fix txq null pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the interface type is P2P_DEVICE or NAN, read the file of '/sys/kernel/debug/ieee80211/phyx/netdev:wlanx/aqm' will get a NULL pointer dereference. As for those interface type, the pointer sdata->vif.txq is NULL. Unable to handle kernel NULL pointer dereference at virtual address 00000011 CPU: 1 PID: 30936 Comm: cat Not tainted 4.14.104 #1 task: ffffffc0337e4880 task.stack: ffffff800cd20000 PC is at ieee80211_if_fmt_aqm+0x34/0xa0 [mac80211] LR is at ieee80211_if_fmt_aqm+0x34/0xa0 [mac80211] [...] Process cat (pid: 30936, stack limit = 0xffffff800cd20000) [...] [] ieee80211_if_fmt_aqm+0x34/0xa0 [mac80211] [] ieee80211_if_read+0x60/0xbc [mac80211] [] ieee80211_if_read_aqm+0x28/0x30 [mac80211] [] full_proxy_read+0x2c/0x48 [] __vfs_read+0x2c/0xd4 [] vfs_read+0x8c/0x108 [] SyS_read+0x40/0x7c Signed-off-by: Miaoqing Pan Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/1569549796-8223-1-git-send-email-miaoqing@codeaurora.org [trim useless data from commit message] Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index b1438fd4d876..64b544ae9966 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -487,9 +487,14 @@ static ssize_t ieee80211_if_fmt_aqm( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { struct ieee80211_local *local = sdata->local; - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + struct txq_info *txqi; int len; + if (!sdata->vif.txq) + return 0; + + txqi = to_txq_info(sdata->vif.txq); + spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -658,7 +663,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); - if (sdata->local->ops->wake_tx_queue) + if (sdata->local->ops->wake_tx_queue && + sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) DEBUGFS_ADD(aqm); } From d8dec42b5c2d2b273bc30b0e073cfbe832d69902 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 1 Oct 2019 13:19:23 +0200 Subject: [PATCH 177/312] mac80211: keep BHs disabled while calling drv_tx_wake_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers typically expect this, as it's the case for almost all cases where this is called (i.e. from the TX path). Also, the code in mac80211 itself (if the driver calls ieee80211_tx_dequeue()) expects this as it uses this_cpu_ptr() without additional protection. This should fix various reports of the problem: https://bugzilla.kernel.org/show_bug.cgi?id=204127 https://lore.kernel.org/linux-wireless/CAN5HydrWb3o_FE6A1XDnP1E+xS66d5kiEuhHfiGKkLNQokx13Q@mail.gmail.com/ https://lore.kernel.org/lkml/nycvar.YFH.7.76.1909111238470.473@cbobk.fhfr.pm/ Cc: stable@vger.kernel.org Reported-and-tested-by: Jiri Kosina Reported-by: Aaron Hill Reported-by: Lukas Redlinger Reported-by: Oleksii Shevchuk Fixes: 21a5d4c3a45c ("mac80211: add stop/start logic for software TXQs") Link: https://lore.kernel.org/r/1569928763-I3e8838c5ecad878e59d4a94eb069a90f6641461a@changeid Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/util.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 051a02ddcb85..32a7a53833c0 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -247,7 +247,8 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) struct sta_info *sta; int i; - spin_lock_bh(&fq->lock); + local_bh_disable(); + spin_lock(&fq->lock); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; @@ -273,9 +274,9 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) &txqi->flags)) continue; - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); - spin_lock_bh(&fq->lock); + spin_lock(&fq->lock); } } @@ -288,12 +289,14 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); + local_bh_enable(); return; out: - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); + local_bh_enable(); } static void From 55131dec2b1c7417d216f861ea7a29dc7c4d2d20 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 1 Oct 2019 10:33:51 +0200 Subject: [PATCH 178/312] net: socionext: netsec: always grab descriptor lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always acquire tx descriptor spinlock even if a xdp program is not loaded on the netsec device since ndo_xdp_xmit can run concurrently with netsec_netdev_start_xmit and netsec_clean_tx_dring. This can happen loading a xdp program on a different device (e.g virtio-net) and xdp_do_redirect_map/xdp_do_redirect_slow can redirect to netsec even if we do not have a xdp program on it. Fixes: ba2b232108d3 ("net: netsec: add XDP support") Tested-by: Ilias Apalodimas Signed-off-by: Lorenzo Bianconi Reviewed-by: Ilias Apalodimas Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/netsec.c | 30 ++++++------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 55db7fbd43cc..f9e6744d8fd6 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -282,7 +282,6 @@ struct netsec_desc_ring { void *vaddr; u16 head, tail; u16 xdp_xmit; /* netsec_xdp_xmit packets */ - bool is_xdp; struct page_pool *page_pool; struct xdp_rxq_info xdp_rxq; spinlock_t lock; /* XDP tx queue locking */ @@ -634,8 +633,7 @@ static bool netsec_clean_tx_dring(struct netsec_priv *priv) unsigned int bytes; int cnt = 0; - if (dring->is_xdp) - spin_lock(&dring->lock); + spin_lock(&dring->lock); bytes = 0; entry = dring->vaddr + DESC_SZ * tail; @@ -682,8 +680,8 @@ next: entry = dring->vaddr + DESC_SZ * tail; cnt++; } - if (dring->is_xdp) - spin_unlock(&dring->lock); + + spin_unlock(&dring->lock); if (!cnt) return false; @@ -799,9 +797,6 @@ static void netsec_set_tx_de(struct netsec_priv *priv, de->data_buf_addr_lw = lower_32_bits(desc->dma_addr); de->buf_len_info = (tx_ctrl->tcp_seg_len << 16) | desc->len; de->attr = attr; - /* under spin_lock if using XDP */ - if (!dring->is_xdp) - dma_wmb(); dring->desc[idx] = *desc; if (desc->buf_type == TYPE_NETSEC_SKB) @@ -1123,12 +1118,10 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb, u16 tso_seg_len = 0; int filled; - if (dring->is_xdp) - spin_lock_bh(&dring->lock); + spin_lock_bh(&dring->lock); filled = netsec_desc_used(dring); if (netsec_check_stop_tx(priv, filled)) { - if (dring->is_xdp) - spin_unlock_bh(&dring->lock); + spin_unlock_bh(&dring->lock); net_warn_ratelimited("%s %s Tx queue full\n", dev_name(priv->dev), ndev->name); return NETDEV_TX_BUSY; @@ -1161,8 +1154,7 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb, tx_desc.dma_addr = dma_map_single(priv->dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); if (dma_mapping_error(priv->dev, tx_desc.dma_addr)) { - if (dring->is_xdp) - spin_unlock_bh(&dring->lock); + spin_unlock_bh(&dring->lock); netif_err(priv, drv, priv->ndev, "%s: DMA mapping failed\n", __func__); ndev->stats.tx_dropped++; @@ -1177,8 +1169,7 @@ static netdev_tx_t netsec_netdev_start_xmit(struct sk_buff *skb, netdev_sent_queue(priv->ndev, skb->len); netsec_set_tx_de(priv, dring, &tx_ctrl, &tx_desc, skb); - if (dring->is_xdp) - spin_unlock_bh(&dring->lock); + spin_unlock_bh(&dring->lock); netsec_write(priv, NETSEC_REG_NRM_TX_PKTCNT, 1); /* submit another tx */ return NETDEV_TX_OK; @@ -1262,7 +1253,6 @@ err: static void netsec_setup_tx_dring(struct netsec_priv *priv) { struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_TX]; - struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog); int i; for (i = 0; i < DESC_NUM; i++) { @@ -1275,12 +1265,6 @@ static void netsec_setup_tx_dring(struct netsec_priv *priv) */ de->attr = 1U << NETSEC_TX_SHIFT_OWN_FIELD; } - - if (xdp_prog) - dring->is_xdp = true; - else - dring->is_xdp = false; - } static int netsec_setup_rx_dring(struct netsec_priv *priv) From c5f75a14a081dc2289739f3958af39392943b1c6 Mon Sep 17 00:00:00 2001 From: Adam Zerella Date: Tue, 1 Oct 2019 21:16:58 +1000 Subject: [PATCH 179/312] docs: networking: Add title caret and missing doc Resolving a couple of Sphinx documentation warnings that are generated in the networking section. - WARNING: document isn't included in any toctree - WARNING: Title underline too short. Signed-off-by: Adam Zerella Signed-off-by: David S. Miller --- Documentation/networking/device_drivers/index.rst | 1 + Documentation/networking/j1939.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst index f51f92571e39..c1f7f75e5fd9 100644 --- a/Documentation/networking/device_drivers/index.rst +++ b/Documentation/networking/device_drivers/index.rst @@ -23,6 +23,7 @@ Contents: intel/ice google/gve mellanox/mlx5 + netronome/nfp pensando/ionic .. only:: subproject and html diff --git a/Documentation/networking/j1939.rst b/Documentation/networking/j1939.rst index ce7e7a044e08..dc60b13fcd09 100644 --- a/Documentation/networking/j1939.rst +++ b/Documentation/networking/j1939.rst @@ -272,7 +272,7 @@ supported flags are: * MSG_DONTWAIT, i.e. non-blocking operation. recvmsg(2) -^^^^^^^^^ +^^^^^^^^^^ In most cases recvmsg(2) is needed if you want to extract more information than recvfrom(2) can provide. For example package priority and timestamp. The From 6de6d18591b5781b7fdfb9d0ade0a4b5afacf882 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 1 Oct 2019 16:21:40 +0200 Subject: [PATCH 180/312] ionic: select CONFIG_NET_DEVLINK When no other driver selects the devlink library code, ionic produces a link failure: drivers/net/ethernet/pensando/ionic/ionic_devlink.o: In function `ionic_devlink_alloc': ionic_devlink.c:(.text+0xd): undefined reference to `devlink_alloc' drivers/net/ethernet/pensando/ionic/ionic_devlink.o: In function `ionic_devlink_register': ionic_devlink.c:(.text+0x71): undefined reference to `devlink_register' Add the same 'select' statement that the other drivers use here. Fixes: fbfb8031533c ("ionic: Add hardware init and device commands") Signed-off-by: Arnd Bergmann Acked-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig index bd0583e409df..d25b88f53de4 100644 --- a/drivers/net/ethernet/pensando/Kconfig +++ b/drivers/net/ethernet/pensando/Kconfig @@ -20,6 +20,7 @@ if NET_VENDOR_PENSANDO config IONIC tristate "Pensando Ethernet IONIC Support" depends on 64BIT && PCI + select NET_DEVLINK help This enables the support for the Pensando family of Ethernet adapters. More specific information on this driver can be From 895b5c9f206eb7d25dc1360a8ccfc5958895eb89 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 29 Sep 2019 20:54:03 +0200 Subject: [PATCH 181/312] netfilter: drop bridge nf reset from nf_reset commit 174e23810cd31 ("sk_buff: drop all skb extensions on free and skb scrubbing") made napi recycle always drop skb extensions. The additional skb_ext_del() that is performed via nf_reset on napi skb recycle is not needed anymore. Most nf_reset() calls in the stack are there so queued skb won't block 'rmmod nf_conntrack' indefinitely. This removes the skb_ext_del from nf_reset, and renames it to a more fitting nf_reset_ct(). In a few selected places, add a call to skb_ext_reset to make sure that no active extensions remain. I am submitting this for "net", because we're still early in the release cycle. The patch applies to net-next too, but I think the rename causes needless divergence between those trees. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/tun.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/net/vrf.c | 8 ++++---- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- drivers/staging/octeon/ethernet-tx.c | 6 ++---- include/linux/skbuff.h | 5 +---- net/batman-adv/soft-interface.c | 2 +- net/core/skbuff.c | 2 +- net/dccp/ipv4.c | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/raw.c | 2 +- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_eth.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- net/packet/af_packet.c | 4 ++-- net/sctp/input.c | 2 +- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_interface.c | 2 +- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 31 files changed, 40 insertions(+), 45 deletions(-) diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 734de7de03f7..e1fabb3e3246 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -238,7 +238,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); - nf_reset(skb); + nf_reset_ct(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(net, skb, NULL); @@ -358,7 +358,7 @@ static int pptp_rcv(struct sk_buff *skb) po = lookup_chan(htons(header->call_id), iph->saddr); if (po) { skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk_pppox(po), skb, 0); } drop: diff --git a/drivers/net/tun.c b/drivers/net/tun.c index aab0be40d443..812dc3a65efb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1104,7 +1104,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) */ skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba98e0971b84..5a635f028bdc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1585,7 +1585,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* Don't wait up for transmitted skbs to be freed. */ if (!use_napi) { skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); } /* If running out of space, stop queue to avoid getting packets that we diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index a4b38a980c3c..ee52bde058df 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -366,7 +366,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, struct neighbour *neigh; int ret; - nf_reset(skb); + nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -459,7 +459,7 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); + nf_reset_ct(skb); else skb = NULL; @@ -560,7 +560,7 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s bool is_v6gw = false; int ret = -EINVAL; - nf_reset(skb); + nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -670,7 +670,7 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); + nf_reset_ct(skb); else skb = NULL; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 635956024e88..45c73a6f09a1 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1261,8 +1261,8 @@ static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, skb_orphan(skb); skb_dst_drop(skb); skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); /* * Get absolute mactime here so all HWs RX at the "same time", and diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index c64728fc21f2..a62057555d1b 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -349,10 +349,8 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev) */ dst_release(skb_dst(skb)); skb_dst_set(skb, NULL); -#ifdef CONFIG_XFRM - secpath_reset(skb); -#endif - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); #ifdef CONFIG_NET_SCHED skb->tc_index = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7d3b1a513ef..4351577b14d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4160,15 +4160,12 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -static inline void nf_reset(struct sk_buff *skb) +static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_del(skb, SKB_EXT_BRIDGE_NF); -#endif } static inline void nf_reset_trace(struct sk_buff *skb) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a1146cb10919..9cbed6f5a85a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -436,7 +436,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* clean the netfilter state now that the batman-adv header has been * removed */ - nf_reset(skb); + nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 01d65206f4fb..529133611ea2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5120,7 +5120,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b685bc82f8d0..d9b4200ed12d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -871,7 +871,7 @@ lookup: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1e2392b7c64e..c59a78a267c3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -199,7 +199,7 @@ resubmit: kfree_skb(skb); return; } - nf_reset(skb); + nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 313470f6bb14..716d5472c022 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1794,7 +1794,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb, ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - nf_reset(skb); + nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, @@ -2140,7 +2140,7 @@ int ip_mr_input(struct sk_buff *skb) mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { - nf_reset(skb); + nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index af3fbf76dbd3..6cc5743c553a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 80da5a66d5d7..3183413ebc6c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - nf_reset(skb); + nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2ee45e3755e9..bf124b1742df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1916,7 +1916,7 @@ process: if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); if (tcp_filter(sk, skb)) goto discard_and_relse; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf755156a684..e8443cc5c1ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1969,7 +1969,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); @@ -2298,7 +2298,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index d432d0011c16..7e5df23cbe7b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -371,7 +371,7 @@ resubmit_final: /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); + nf_reset_ct(skb); skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index e6c9da9866b1..a0a2de30be3e 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -54,7 +54,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, return; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e1888ee4036..a77f6b7d3a7c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -215,7 +215,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) /* Not releasing hash table! */ if (clone) { - nf_reset(clone); + nf_reset_ct(clone); rawv6_rcv(sk, clone); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 105e5a7092e7..f82ea12bac37 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1078,7 +1078,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - nf_reset(skb); + nf_reset_ct(skb); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index bd3f39349d40..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -151,7 +151,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); rcu_read_lock(); dev = rcu_dereference(spriv->dev); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 622833317dcb..0d7c887a2b75 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -193,7 +193,7 @@ pass_up: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 687e23a8b326..802f19aba7e3 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -206,7 +206,7 @@ pass_up: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9c464d24beec..888d3068a492 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -613,7 +613,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) ret = ip_vs_confirm_conntrack(skb); if (ret == NF_ACCEPT) { - nf_reset(skb); + nf_reset_ct(skb); skb_forward_csum(skb); } return ret; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d2437b5b2f6a..21c90d3a7ebf 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -237,7 +237,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb) } skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2742b006d25..82a50e850245 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1821,7 +1821,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; @@ -2121,7 +2121,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; diff --git a/net/sctp/input.c b/net/sctp/input.c index 1008cdc44dd6..5a070fb5b278 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -201,7 +201,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; - nf_reset(skb); + nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6088bc2dc11e..9b599ed66d97 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -706,7 +706,7 @@ resume: if (err) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 2ab4859df55a..0f5131bc3342 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -185,7 +185,7 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9499b35feb92..b1db55b50ba1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -502,7 +502,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { - nf_reset(skb); + nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 21e939235b39..f2d1e573ea55 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2808,7 +2808,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) continue; } - nf_reset(skb); + nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); From 34a4c95abd25ab41fb390b985a08a651b1fa0b0f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 30 Sep 2019 11:05:49 +0200 Subject: [PATCH 182/312] netfilter: nft_connlimit: disable bh on garbage collection BH must be disabled when invoking nf_conncount_gc_list() to perform garbage collection, otherwise deadlock might happen. nf_conncount_add+0x1f/0x50 [nf_conncount] nft_connlimit_eval+0x4c/0xe0 [nft_connlimit] nft_dynset_eval+0xb5/0x100 [nf_tables] nft_do_chain+0xea/0x420 [nf_tables] ? sch_direct_xmit+0x111/0x360 ? noqueue_init+0x10/0x10 ? __qdisc_run+0x84/0x510 ? tcp_packet+0x655/0x1610 [nf_conntrack] ? ip_finish_output2+0x1a7/0x430 ? tcp_error+0x130/0x150 [nf_conntrack] ? nf_conntrack_in+0x1fc/0x4c0 [nf_conntrack] nft_do_chain_ipv4+0x66/0x80 [nf_tables] nf_hook_slow+0x44/0xc0 ip_rcv+0xb5/0xd0 ? ip_rcv_finish_core.isra.19+0x360/0x360 __netif_receive_skb_one_core+0x52/0x70 netif_receive_skb_internal+0x34/0xe0 napi_gro_receive+0xba/0xe0 e1000_clean_rx_irq+0x1e9/0x420 [e1000e] e1000e_poll+0xbe/0x290 [e1000e] net_rx_action+0x149/0x3b0 __do_softirq+0xde/0x2d8 irq_exit+0xba/0xc0 do_IRQ+0x85/0xd0 common_interrupt+0xf/0xf RIP: 0010:nf_conncount_gc_list+0x3b/0x130 [nf_conncount] Fixes: 2f971a8f4255 ("netfilter: nf_conncount: move all list iterations under spinlock") Reported-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_connlimit.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index af1497ab9464..69d6173f91e2 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -218,8 +218,13 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); + bool ret; - return nf_conncount_gc_list(net, &priv->list); + local_bh_disable(); + ret = nf_conncount_gc_list(net, &priv->list); + local_bh_enable(); + + return ret; } static struct nft_expr_type nft_connlimit_type; From 9a9251a3534745d08a92abfeca0ca467b912b5f6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:37:22 +0300 Subject: [PATCH 183/312] net: sched: taprio: Avoid division by zero on invalid link speed The check in taprio_set_picos_per_byte is currently not robust enough and will trigger this division by zero, due to e.g. PHYLINK not setting kset->base.speed when there is no PHY connected: [ 27.109992] Division by zero in kernel. [ 27.113842] CPU: 1 PID: 198 Comm: tc Not tainted 5.3.0-rc5-01246-gc4006b8c2637-dirty #212 [ 27.121974] Hardware name: Freescale LS1021A [ 27.126234] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 27.133938] [] (show_stack) from [] (dump_stack+0xb0/0xc4) [ 27.141124] [] (dump_stack) from [] (Ldiv0_64+0x8/0x18) [ 27.148052] [] (Ldiv0_64) from [] (div64_u64+0xcc/0xf0) [ 27.154978] [] (div64_u64) from [] (div64_s64+0x4c/0x68) [ 27.161993] [] (div64_s64) from [] (taprio_set_picos_per_byte+0xe8/0xf4) [ 27.170388] [] (taprio_set_picos_per_byte) from [] (taprio_change+0x668/0xcec) [ 27.179302] [] (taprio_change) from [] (qdisc_create+0x1fc/0x4f4) [ 27.187091] [] (qdisc_create) from [] (tc_modify_qdisc+0x1ac/0x6f8) [ 27.195055] [] (tc_modify_qdisc) from [] (rtnetlink_rcv_msg+0x268/0x2dc) [ 27.203449] [] (rtnetlink_rcv_msg) from [] (netlink_rcv_skb+0xe0/0x114) [ 27.211756] [] (netlink_rcv_skb) from [] (netlink_unicast+0x1b4/0x22c) [ 27.219977] [] (netlink_unicast) from [] (netlink_sendmsg+0x284/0x340) [ 27.228198] [] (netlink_sendmsg) from [] (sock_sendmsg+0x14/0x24) [ 27.235988] [] (sock_sendmsg) from [] (___sys_sendmsg+0x214/0x228) [ 27.243863] [] (___sys_sendmsg) from [] (__sys_sendmsg+0x50/0x8c) [ 27.251652] [] (__sys_sendmsg) from [] (ret_fast_syscall+0x0/0x54) [ 27.259524] Exception stack(0xe8045fa8 to 0xe8045ff0) [ 27.264546] 5fa0: b6f608c8 000000f8 00000003 bed7e2f0 00000000 00000000 [ 27.272681] 5fc0: b6f608c8 000000f8 004ce54c 00000128 5d3ce8c7 00000000 00000026 00505c9c [ 27.280812] 5fe0: 00000070 bed7e298 004ddd64 b6dd1e64 Russell King points out that the ethtool API says zero is a valid return value of __ethtool_get_link_ksettings: * If it is enabled then they are read-only; if the link * is up they represent the negotiated link mode; if the link is down, * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. So, it seems that taprio is not following the API... I'd suggest either fixing taprio, or getting agreement to change the ethtool API. The chosen path was to fix taprio. Fixes: 7b9eba7ba0c1 ("net/sched: taprio: fix picos_per_byte miscalculation") Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2aab46ada94f..68b543f85a96 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1044,7 +1044,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev, if (err < 0) goto skip; - if (ecmd.base.speed != SPEED_UNKNOWN) + if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: From 83c8c3cf45163f0c823db37be6ab04dfcf8ac751 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:39:48 +0300 Subject: [PATCH 184/312] net: sched: cbs: Avoid division by zero when calculating the port rate As explained in the "net: sched: taprio: Avoid division by zero on invalid link speed" commit, it is legal for the ethtool API to return zero as a link speed. So guard against it to ensure we don't perform a division by zero in kernel. Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation") Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 1bef152c5721..b2905b03a432 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -306,7 +306,7 @@ static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) if (err < 0) goto skip; - if (ecmd.base.speed != SPEED_UNKNOWN) + if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: From d2c50b1cd94528aea8c8e9abb4cce81590f32cc4 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Sun, 29 Sep 2019 14:54:24 +0800 Subject: [PATCH 185/312] net: mscc: ocelot: add missing of_node_put after calling of_get_child_by_name of_node_put needs to be called when the device node which is got from of_get_child_by_name finished using. In both cases of success and failure, we need to release 'ports', so clean up the code using goto. fixes: a556c76adc05 ("net: mscc: Add initial Ocelot switch support") Signed-off-by: Wen Yang Cc: Alexandre Belloni Cc: Microchip Linux Driver Support Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_board.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index b063eb78fa0c..aac115136720 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -388,13 +388,14 @@ static int mscc_ocelot_probe(struct platform_device *pdev) continue; phy = of_phy_find_device(phy_node); + of_node_put(phy_node); if (!phy) continue; err = ocelot_probe_port(ocelot, port, regs, phy); if (err) { of_node_put(portnp); - return err; + goto out_put_ports; } phy_mode = of_get_phy_mode(portnp); @@ -422,7 +423,8 @@ static int mscc_ocelot_probe(struct platform_device *pdev) "invalid phy mode for port%d, (Q)SGMII only\n", port); of_node_put(portnp); - return -EINVAL; + err = -EINVAL; + goto out_put_ports; } serdes = devm_of_phy_get(ocelot->dev, portnp, NULL); @@ -435,7 +437,8 @@ static int mscc_ocelot_probe(struct platform_device *pdev) "missing SerDes phys for port%d\n", port); - goto err_probe_ports; + of_node_put(portnp); + goto out_put_ports; } ocelot->ports[port]->serdes = serdes; @@ -447,9 +450,8 @@ static int mscc_ocelot_probe(struct platform_device *pdev) dev_info(&pdev->dev, "Ocelot switch probed\n"); - return 0; - -err_probe_ports: +out_put_ports: + of_node_put(ports); return err; } From f32eb9d80470dab05df26b6efd02d653c72e6a11 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Sun, 29 Sep 2019 15:00:47 +0800 Subject: [PATCH 186/312] net: dsa: rtl8366rb: add missing of_node_put after calling of_get_child_by_name of_node_put needs to be called when the device node which is got from of_get_child_by_name finished using. irq_domain_add_linear() also calls of_node_get() to increase refcount, so irq_domain will not be affected when it is released. Fixes: d8652956cf37 ("net: dsa: realtek-smi: Add Realtek SMI driver") Signed-off-by: Wen Yang Cc: Linus Walleij Cc: Andrew Lunn Cc: Vivien Didelot Cc: Florian Fainelli Cc: "David S. Miller" Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Linus Walleij Signed-off-by: David S. Miller --- drivers/net/dsa/rtl8366rb.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/rtl8366rb.c b/drivers/net/dsa/rtl8366rb.c index a268085ffad2..f5cc8b0a7c74 100644 --- a/drivers/net/dsa/rtl8366rb.c +++ b/drivers/net/dsa/rtl8366rb.c @@ -507,7 +507,8 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi) irq = of_irq_get(intc, 0); if (irq <= 0) { dev_err(smi->dev, "failed to get parent IRQ\n"); - return irq ? irq : -EINVAL; + ret = irq ? irq : -EINVAL; + goto out_put_node; } /* This clears the IRQ status register */ @@ -515,7 +516,7 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi) &val); if (ret) { dev_err(smi->dev, "can't read interrupt status\n"); - return ret; + goto out_put_node; } /* Fetch IRQ edge information from the descriptor */ @@ -537,7 +538,7 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi) val); if (ret) { dev_err(smi->dev, "could not configure IRQ polarity\n"); - return ret; + goto out_put_node; } ret = devm_request_threaded_irq(smi->dev, irq, NULL, @@ -545,7 +546,7 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi) "RTL8366RB", smi); if (ret) { dev_err(smi->dev, "unable to request irq: %d\n", ret); - return ret; + goto out_put_node; } smi->irqdomain = irq_domain_add_linear(intc, RTL8366RB_NUM_INTERRUPT, @@ -553,12 +554,15 @@ static int rtl8366rb_setup_cascaded_irq(struct realtek_smi *smi) smi); if (!smi->irqdomain) { dev_err(smi->dev, "failed to create IRQ domain\n"); - return -EINVAL; + ret = -EINVAL; + goto out_put_node; } for (i = 0; i < smi->num_ports; i++) irq_set_parent(irq_create_mapping(smi->irqdomain, i), irq); - return 0; +out_put_node: + of_node_put(intc); + return ret; } static int rtl8366rb_set_addr(struct realtek_smi *smi) From 93c2fcb01ae919b6e882b0931383d33aaa9bf7a6 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 30 Sep 2019 11:52:21 +0530 Subject: [PATCH 187/312] devlink: Fix error handling in param and info_get dumpit cb If any of the param or info_get op returns error, dumpit cb is skipping to dump remaining params or info_get ops for all the drivers. Fix to not return if any of the param/info_get op returns error as not supported and continue to dump remaining information. v2: Modify the patch to return error, except for params/info_get op that return -EOPNOTSUPP as suggested by Andrew Lunn. Also, modify commit message to reflect the same. Cc: Andrew Lunn Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam Acked-by: Jiri Pirko Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/core/devlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index e48680efe54a..f80151eeaf51 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3172,7 +3172,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { + if (err && err != -EOPNOTSUPP) { mutex_unlock(&devlink->lock); goto out; } @@ -3432,7 +3432,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { + if (err && err != -EOPNOTSUPP) { mutex_unlock(&devlink->lock); goto out; } @@ -4088,7 +4088,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); mutex_unlock(&devlink->lock); - if (err) + if (err && err != -EOPNOTSUPP) break; idx++; } From 9a2ae7b3960eb2426a8560cbc3251e3453230d21 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:05 +0200 Subject: [PATCH 188/312] net: stmmac: xgmac: Not all Unicast addresses may be available Some setups may not have all Unicast addresses filters available. Let's check this before trying to setup filters. Fixes: 0efedbf11f07 ("net: stmmac: xgmac: Fix XGMAC selftests") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 2b277b2c586b..6d8ac2ef4fc2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -472,7 +472,7 @@ static void dwxgmac2_set_filter(struct mac_device_info *hw, dwxgmac2_set_mchash(ioaddr, mc_filter, mcbitslog2); /* Handle multiple unicast addresses */ - if (netdev_uc_count(dev) > XGMAC_ADDR_MAX) { + if (netdev_uc_count(dev) > hw->unicast_filter_entries) { value |= XGMAC_FILTER_PR; } else { struct netdev_hw_addr *ha; From c11986b9fa74cd3ce218e0ca7e0de88d30e6f497 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:06 +0200 Subject: [PATCH 189/312] net: stmmac: xgmac: Detect Hash Table size dinamically Since commit b8ef7020d6e5 ("net: stmmac: add support for hash table size 128/256 in dwmac4"), we can detect the Hash Table dinamically. Let's implement this feature in XGMAC cores and fix possible setups that don't support the maximum size for Hash Table. Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h | 1 + drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index 5923ca62d793..f7eb06f8fb37 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -122,6 +122,7 @@ #define XGMAC_HWFEAT_GMIISEL BIT(1) #define XGMAC_HW_FEATURE1 0x00000120 #define XGMAC_HWFEAT_L3L4FNUM GENMASK(30, 27) +#define XGMAC_HWFEAT_HASHTBLSZ GENMASK(25, 24) #define XGMAC_HWFEAT_RSSEN BIT(20) #define XGMAC_HWFEAT_TSOEN BIT(18) #define XGMAC_HWFEAT_SPHEN BIT(17) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 53c4a40d8386..965cbe3e6f51 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -380,6 +380,7 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr, /* MAC HW feature 1 */ hw_cap = readl(ioaddr + XGMAC_HW_FEATURE1); dma_cap->l3l4fnum = (hw_cap & XGMAC_HWFEAT_L3L4FNUM) >> 27; + dma_cap->hash_tb_sz = (hw_cap & XGMAC_HWFEAT_HASHTBLSZ) >> 24; dma_cap->rssen = (hw_cap & XGMAC_HWFEAT_RSSEN) >> 20; dma_cap->tsoen = (hw_cap & XGMAC_HWFEAT_TSOEN) >> 18; dma_cap->sphen = (hw_cap & XGMAC_HWFEAT_SPHEN) >> 17; From 432439fef933602cae51fb141f33014fb03f2b38 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:07 +0200 Subject: [PATCH 190/312] net: stmmac: selftests: Always use max DMA size in Jumbo Test Although some XGMAC setups support frames larger than DMA size, some of them may not. As we can't know before-hand which ones support let's use the maximum DMA buffer size in the Jumbo Tests. User can always reconfigure the MTU to achieve larger frames. Fixes: 427849e8c37f ("net: stmmac: selftests: Add Jumbo Frame tests") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index 5f66f6161629..cc76a42c7466 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -1564,10 +1564,6 @@ static int __stmmac_test_jumbo(struct stmmac_priv *priv, u16 queue) struct stmmac_packet_attrs attr = { }; int size = priv->dma_buf_sz; - /* Only XGMAC has SW support for multiple RX descs in same packet */ - if (priv->plat->has_xgmac) - size = priv->dev->max_mtu; - attr.dst = priv->dev->dev_addr; attr.max_size = size - ETH_FCS_LEN; attr.queue_mapping = queue; From f79bfda3756c50a86c0ee65091935c42c5bbe0cb Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:08 +0200 Subject: [PATCH 191/312] net: stmmac: dwmac4: Always update the MAC Hash Filter We need to always update the MAC Hash Filter so that previous entries are invalidated. Found out while running stmmac selftests. Fixes: b8ef7020d6e5 ("net: stmmac: add support for hash table size 128/256 in dwmac4") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 9b4b5f69fc02..2cb9c53f93b8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -401,8 +401,11 @@ static void dwmac4_set_filter(struct mac_device_info *hw, int numhashregs = (hw->multicast_filter_bins >> 5); int mcbitslog2 = hw->mcast_bits_log2; unsigned int value; + u32 mc_filter[8]; int i; + memset(mc_filter, 0, sizeof(mc_filter)); + value = readl(ioaddr + GMAC_PACKET_FILTER); value &= ~GMAC_PACKET_FILTER_HMC; value &= ~GMAC_PACKET_FILTER_HPF; @@ -416,16 +419,13 @@ static void dwmac4_set_filter(struct mac_device_info *hw, /* Pass all multi */ value |= GMAC_PACKET_FILTER_PM; /* Set all the bits of the HASH tab */ - for (i = 0; i < numhashregs; i++) - writel(0xffffffff, ioaddr + GMAC_HASH_TAB(i)); + memset(mc_filter, 0xff, sizeof(mc_filter)); } else if (!netdev_mc_empty(dev)) { struct netdev_hw_addr *ha; - u32 mc_filter[8]; /* Hash filter for multicast */ value |= GMAC_PACKET_FILTER_HMC; - memset(mc_filter, 0, sizeof(mc_filter)); netdev_for_each_mc_addr(ha, dev) { /* The upper n bits of the calculated CRC are used to * index the contents of the hash table. The number of @@ -440,10 +440,11 @@ static void dwmac4_set_filter(struct mac_device_info *hw, */ mc_filter[bit_nr >> 5] |= (1 << (bit_nr & 0x1f)); } - for (i = 0; i < numhashregs; i++) - writel(mc_filter[i], ioaddr + GMAC_HASH_TAB(i)); } + for (i = 0; i < numhashregs; i++) + writel(mc_filter[i], ioaddr + GMAC_HASH_TAB(i)); + value |= GMAC_PACKET_FILTER_HPF; /* Handle multiple unicast addresses */ From 14f347334bf232074616e29e29103dd0c7c54dec Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:09 +0200 Subject: [PATCH 192/312] net: stmmac: Correctly take timestamp for PTPv2 The case for PTPV2_EVENT requires event packets to be captured so add this setting to the list of enabled captures. Fixes: 891434b18ec0 ("stmmac: add IEEE PTPv1 and PTPv2 support.") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d3232738fb25..31a237ec73bc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -629,6 +629,7 @@ static int stmmac_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; ptp_v2 = PTP_TCR_TSVER2ENA; snap_type_sel = PTP_TCR_SNAPTYPSEL_1; + ts_event_en = PTP_TCR_TSEVNTENA; ptp_over_ipv4_udp = PTP_TCR_TSIPV4ENA; ptp_over_ipv6_udp = PTP_TCR_TSIPV6ENA; ptp_over_ethernet = PTP_TCR_TSIPENA; From 3e2bf04fb0447aa4b967b8000125178f55ae7800 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:10 +0200 Subject: [PATCH 193/312] net: stmmac: Do not stop PHY if WoL is enabled If WoL is enabled we can't really stop the PHY, otherwise we will not receive the WoL packet. Fix this by telling phylink that only the MAC is down and only stop the PHY if WoL is not enabled. Fixes: 74371272f97f ("net: stmmac: Convert to phylink and remove phylib logic") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 31a237ec73bc..843d53e084b7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4718,9 +4718,7 @@ int stmmac_suspend(struct device *dev) mutex_lock(&priv->lock); - rtnl_lock(); - phylink_stop(priv->phylink); - rtnl_unlock(); + phylink_mac_change(priv->phylink, false); netif_device_detach(ndev); stmmac_stop_all_queues(priv); @@ -4735,6 +4733,10 @@ int stmmac_suspend(struct device *dev) stmmac_pmt(priv, priv->hw, priv->wolopts); priv->irq_wake = 1; } else { + rtnl_lock(); + phylink_stop(priv->phylink); + rtnl_unlock(); + stmmac_mac_set(priv, priv->ioaddr, false); pinctrl_pm_select_sleep_state(priv->device); /* Disable clock in case of PWM is off */ @@ -4825,9 +4827,13 @@ int stmmac_resume(struct device *dev) stmmac_start_all_queues(priv); - rtnl_lock(); - phylink_start(priv->phylink); - rtnl_unlock(); + if (!device_may_wakeup(priv->device)) { + rtnl_lock(); + phylink_start(priv->phylink); + rtnl_unlock(); + } + + phylink_mac_change(priv->phylink, true); mutex_unlock(&priv->lock); From 30300d9f9150e4673bbef68fcae40cf267b87293 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:11 +0200 Subject: [PATCH 194/312] net: stmmac: xgmac: Disable the Timestamp interrupt by default We don't use it anyway as XGMAC only supports polling for timestamp (in current SW implementation). This greatly reduces the system load by reducing the number of interrupts. Fixes: 2142754f8b9c ("net: stmmac: Add MAC related callbacks for XGMAC2") Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h index f7eb06f8fb37..99037386080a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h @@ -84,7 +84,7 @@ #define XGMAC_TSIE BIT(12) #define XGMAC_LPIIE BIT(5) #define XGMAC_PMTIE BIT(4) -#define XGMAC_INT_DEFAULT_EN (XGMAC_LPIIE | XGMAC_PMTIE | XGMAC_TSIE) +#define XGMAC_INT_DEFAULT_EN (XGMAC_LPIIE | XGMAC_PMTIE) #define XGMAC_Qx_TX_FLOW_CTRL(x) (0x00000070 + (x) * 4) #define XGMAC_PT GENMASK(31, 16) #define XGMAC_PT_SHIFT 16 From 3c72d4d33059cf84a23492e827731f12c023ab00 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:12 +0200 Subject: [PATCH 195/312] net: stmmac: xgmac: Fix RSS not writing all Keys to HW The sizeof(cfg->key) is != ARRAY_SIZE(cfg->key). Fix it. This warning is triggered when running with cc flag -Wsizeof-array-div. Reported-by: kbuild test robot Reported-by: Nick Desaulniers Reported-by: Nathan Chancellor Fixes: 76067459c686 ("net: stmmac: Implement RSS and enable it in XGMAC core") Reviewed-by: Nick Desaulniers Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 6d8ac2ef4fc2..4a1f52474dbc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -533,7 +533,7 @@ static int dwxgmac2_rss_configure(struct mac_device_info *hw, return 0; } - for (i = 0; i < (sizeof(cfg->key) / sizeof(u32)); i++) { + for (i = 0; i < (ARRAY_SIZE(cfg->key) / sizeof(u32)); i++) { ret = dwxgmac2_rss_write_reg(ioaddr, true, i, cfg->key[i]); if (ret) return ret; From 56627336b655494356a23e65200c21718a1c3b2b Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 30 Sep 2019 10:19:13 +0200 Subject: [PATCH 196/312] net: stmmac: xgmac: Fix RSS writing wrong keys Commit b6b6cc9acd7b, changed the call to dwxgmac2_rss_write_reg() passing it the variable cfg->key[i]. As key is an u8 but we write 32 bits at a time we need to cast it into an u32 so that the correct key values are written. Notice that the for loop already takes this into account so we don't try to write past the keys size. Fixes: b6b6cc9acd7b ("net: stmmac: selftest: avoid large stack usage") Signed-off-by: Jose Abreu Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 4a1f52474dbc..5031398e612c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -523,8 +523,8 @@ static int dwxgmac2_rss_configure(struct mac_device_info *hw, struct stmmac_rss *cfg, u32 num_rxq) { void __iomem *ioaddr = hw->pcsr; + u32 value, *key; int i, ret; - u32 value; value = readl(ioaddr + XGMAC_RSS_CTRL); if (!cfg || !cfg->enable) { @@ -533,8 +533,9 @@ static int dwxgmac2_rss_configure(struct mac_device_info *hw, return 0; } + key = (u32 *)cfg->key; for (i = 0; i < (ARRAY_SIZE(cfg->key) / sizeof(u32)); i++) { - ret = dwxgmac2_rss_write_reg(ioaddr, true, i, cfg->key[i]); + ret = dwxgmac2_rss_write_reg(ioaddr, true, i, key[i]); if (ret) return ret; } From 569aad4fcd82cba64eb10ede235d330a00f0aa09 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 1 Oct 2019 08:41:47 +0200 Subject: [PATCH 197/312] net: ag71xx: fix mdio subnode support This patch is syncing driver with actual devicetree documentation: Documentation/devicetree/bindings/net/qca,ar71xx.txt |Optional subnodes: |- mdio : specifies the mdio bus, used as a container for phy nodes | according to phy.txt in the same directory The driver was working with fixed phy without any noticeable issues. This bug was uncovered by introducing dsa ar9331-switch driver. Since no one reported this bug until now, I assume no body is using it and this patch should not brake existing system. Fixes: d51b6ce441d3 ("net: ethernet: add ag71xx driver") Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/ag71xx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 7548247455d7..1b1a09095c0d 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -526,7 +526,7 @@ static int ag71xx_mdio_probe(struct ag71xx *ag) struct device *dev = &ag->pdev->dev; struct net_device *ndev = ag->ndev; static struct mii_bus *mii_bus; - struct device_node *np; + struct device_node *np, *mnp; int err; np = dev->of_node; @@ -571,7 +571,9 @@ static int ag71xx_mdio_probe(struct ag71xx *ag) msleep(200); } - err = of_mdiobus_register(mii_bus, np); + mnp = of_get_child_by_name(np, "mdio"); + err = of_mdiobus_register(mii_bus, mnp); + of_node_put(mnp); if (err) goto mdio_err_put_clk; From 73956fc07dd7b25d4a33ab3fdd6247c60d0b237c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Oct 2019 10:50:33 +0200 Subject: [PATCH 198/312] membarrier: Fix RCU locking bug caused by faulty merge The following commit: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") got fat fingered by me when merging it with other patches. It meant to move the RCU section out of the for loop but ended up doing it partially, leaving a superfluous rcu_read_lock() inside, causing havok. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris Metcalf Cc: Christoph Lameter Cc: Eric W. Biederman Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Russell King - ARM Linux admin Cc: Thomas Gleixner Cc: linux-tip-commits@vger.kernel.org Fixes: 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy load") Link: https://lkml.kernel.org/r/20191001085033.GP4519@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a39bed2c784f..168479a7d61b 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -174,7 +174,6 @@ static int membarrier_private_expedited(int flags) */ if (cpu == raw_smp_processor_id()) continue; - rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); From ade77a2da81efb5eaff1f7cc1b1bc14cd9380faa Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 1 Oct 2019 10:38:03 -0700 Subject: [PATCH 199/312] ARM: xen: unexport HYPERVISOR_platform_op function HYPERVISOR_platform_op() is an inline function and should not be exported. Since commit 15bfc2348d54 ("modpost: check for static EXPORT_SYMBOL* functions"), this causes a warning: WARNING: "HYPERVISOR_platform_op" [vmlinux] is a static EXPORT_SYMBOL_GPL Instead, export the underlying function called by the static inline: HYPERVISOR_platform_op_raw. Fixes: 15bfc2348d54 ("modpost: check for static EXPORT_SYMBOL* functions") Signed-off-by: Arnd Bergmann Signed-off-by: Stefano Stabellini Signed-off-by: Stefano Stabellini --- arch/arm/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 1e57692552d9..522c97d43ef8 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -437,7 +437,7 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_memory_op); EXPORT_SYMBOL_GPL(HYPERVISOR_physdev_op); EXPORT_SYMBOL_GPL(HYPERVISOR_vcpu_op); EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op); -EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op); +EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op_raw); EXPORT_SYMBOL_GPL(HYPERVISOR_multicall); EXPORT_SYMBOL_GPL(HYPERVISOR_vm_assist); EXPORT_SYMBOL_GPL(HYPERVISOR_dm_op); From c5ad81eb029570c5ca5859539b0679f07a776d25 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 27 Sep 2019 17:46:28 +0200 Subject: [PATCH 200/312] xen/balloon: Set pages PageOffline() in balloon_add_region() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are missing a __SetPageOffline(), which is why we can get !PageOffline() pages onto the balloon list, where alloc_xenballooned_pages() will complain: page:ffffea0003e7ffc0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0xffffe00001000(reserved) raw: 000ffffe00001000 dead000000000100 dead000000000200 0000000000000000 raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: VM_BUG_ON_PAGE(!PageOffline(page)) ------------[ cut here ]------------ kernel BUG at include/linux/page-flags.h:744! invalid opcode: 0000 [#1] SMP NOPTI Reported-by: Marek Marczykowski-Górecki Tested-by: Marek Marczykowski-Górecki Fixes: 77c4adf6a6df ("xen/balloon: mark inflated pages PG_offline") Cc: stable@vger.kernel.org # v5.1+ Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Signed-off-by: David Hildenbrand Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/balloon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 4e11de6cde81..91cba70b69df 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -688,6 +688,7 @@ static void __init balloon_add_region(unsigned long start_pfn, /* totalram_pages and totalhigh_pages do not include the boot-time balloon extension, so don't subtract from it. */ + __SetPageOffline(page); __balloon_append(page); } From dde3285ffa1a75dbd67c6a61e5f3eba2267c6593 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 1 Oct 2019 11:01:50 +0200 Subject: [PATCH 201/312] xen/balloon: Drop __balloon_append() Let's simply use balloon_append() directly. Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Signed-off-by: David Hildenbrand Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/balloon.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 91cba70b69df..37443c5fda99 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -156,7 +156,7 @@ static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) /* balloon_append: add the given page to the balloon. */ -static void __balloon_append(struct page *page) +static void balloon_append(struct page *page) { /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { @@ -169,11 +169,6 @@ static void __balloon_append(struct page *page) wake_up(&balloon_wq); } -static void balloon_append(struct page *page) -{ - __balloon_append(page); -} - /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ static struct page *balloon_retrieve(bool require_lowmem) { @@ -378,7 +373,7 @@ static void xen_online_page(struct page *page, unsigned int order) p = pfn_to_page(start_pfn + i); __online_page_set_limits(p); __SetPageOffline(p); - __balloon_append(p); + balloon_append(p); } mutex_unlock(&balloon_mutex); } @@ -689,7 +684,7 @@ static void __init balloon_add_region(unsigned long start_pfn, include the boot-time balloon extension, so don't subtract from it. */ __SetPageOffline(page); - __balloon_append(page); + balloon_append(page); } balloon_stats.total_pages += extra_pfn_end - start_pfn; From 59b52f105f23edebf497bf8f0e57660e649e3f17 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 1 Oct 2019 11:01:51 +0200 Subject: [PATCH 202/312] xen/balloon: Mark pages PG_offline in balloon_append() Let's move the __SetPageOffline() call which all callers perform into balloon_append(). In bp_state decrease_reservation(), pages are now marked PG_offline a little later than before, however, this should not matter for XEN. Suggested-by: Boris Ostrovsky Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Signed-off-by: David Hildenbrand Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/balloon.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 37443c5fda99..8c245e99bb06 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -158,6 +158,8 @@ static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); /* balloon_append: add the given page to the balloon. */ static void balloon_append(struct page *page) { + __SetPageOffline(page); + /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { list_add_tail(&page->lru, &ballooned_pages); @@ -372,7 +374,6 @@ static void xen_online_page(struct page *page, unsigned int order) for (i = 0; i < size; i++) { p = pfn_to_page(start_pfn + i); __online_page_set_limits(p); - __SetPageOffline(p); balloon_append(p); } mutex_unlock(&balloon_mutex); @@ -466,7 +467,6 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) state = BP_EAGAIN; break; } - __SetPageOffline(page); adjust_managed_page_count(page, -1); xenmem_reservation_scrub_page(page); list_add(&page->lru, &pages); @@ -648,10 +648,8 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) mutex_lock(&balloon_mutex); for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - __SetPageOffline(pages[i]); + if (pages[i]) balloon_append(pages[i]); - } } balloon_stats.target_unpopulated -= nr_pages; @@ -669,7 +667,6 @@ static void __init balloon_add_region(unsigned long start_pfn, unsigned long pages) { unsigned long pfn, extra_pfn_end; - struct page *page; /* * If the amount of usable memory has been limited (e.g., with @@ -679,12 +676,10 @@ static void __init balloon_add_region(unsigned long start_pfn, extra_pfn_end = min(max_pfn, start_pfn + pages); for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { - page = pfn_to_page(pfn); /* totalram_pages and totalhigh_pages do not include the boot-time balloon extension, so don't subtract from it. */ - __SetPageOffline(page); - balloon_append(page); + balloon_append(pfn_to_page(pfn)); } balloon_stats.total_pages += extra_pfn_end - start_pfn; From 04ccbdc390b14d8fdc4612abbf023ff49cdc5cd8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 1 Oct 2019 11:01:52 +0200 Subject: [PATCH 203/312] xen/balloon: Clear PG_offline in balloon_retrieve() Let's move the clearing to balloon_retrieve(). In bp_state increase_reservation(), we now clear the flag a little earlier than before, however, this should not matter for XEN. Suggested-by: Boris Ostrovsky Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Signed-off-by: David Hildenbrand Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/balloon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 8c245e99bb06..5bae515c8e25 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -189,6 +189,7 @@ static struct page *balloon_retrieve(bool require_lowmem) else balloon_stats.balloon_low--; + __ClearPageOffline(page); return page; } @@ -440,7 +441,6 @@ static enum bp_state increase_reservation(unsigned long nr_pages) xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); /* Relinquish the page back to the allocator. */ - __ClearPageOffline(page); free_reserved_page(page); } @@ -606,7 +606,6 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) while (pgno < nr_pages) { page = balloon_retrieve(true); if (page) { - __ClearPageOffline(page); pages[pgno++] = page; #ifdef CONFIG_XEN_HAVE_PVMMU /* From d345d9cad22532ac18afa70efa0506b1d8cface5 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 27 Sep 2019 13:33:39 +0800 Subject: [PATCH 204/312] MIPS: cpu-bugs64: Mark inline functions as __always_inline Commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") allows compiler to uninline functions marked as 'inline'. Leading to section mismatch in this case. Since we're using const variables to pass assembly flags, 'inline's can't be dropped. So we simply mark them as __always_inline. Signed-off-by: Jiaxun Yang Cc: linux-mips@vger.kernel.org [paul.burton@mips.com: - Annotate these functions with __init, even if it only serves to inform human readers when the code can be used. - Drop the __always_inline from check_daddi() & check_daddiu() which don't use arguments as immediates in inline asm. - Rewrap the commit message.] Signed-off-by: Paul Burton --- arch/mips/kernel/cpu-bugs64.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c index fa62cd1dff93..6a7afe7ef4d3 100644 --- a/arch/mips/kernel/cpu-bugs64.c +++ b/arch/mips/kernel/cpu-bugs64.c @@ -24,7 +24,8 @@ static char r4kwar[] __initdata = static char daddiwar[] __initdata = "Enable CPU_DADDI_WORKAROUNDS to rectify."; -static inline void align_mod(const int align, const int mod) +static __always_inline __init +void align_mod(const int align, const int mod) { asm volatile( ".set push\n\t" @@ -38,8 +39,9 @@ static inline void align_mod(const int align, const int mod) : "n"(align), "n"(mod)); } -static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w, - const int align, const int mod) +static __always_inline __init +void mult_sh_align_mod(long *v1, long *v2, long *w, + const int align, const int mod) { unsigned long flags; int m1, m2; @@ -113,7 +115,7 @@ static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w, *w = lw; } -static inline void check_mult_sh(void) +static __always_inline __init void check_mult_sh(void) { long v1[8], v2[8], w[8]; int bug, fix, i; @@ -176,7 +178,7 @@ asmlinkage void __init do_daddi_ov(struct pt_regs *regs) exception_exit(prev_state); } -static inline void check_daddi(void) +static __init void check_daddi(void) { extern asmlinkage void handle_daddi_ov(void); unsigned long flags; @@ -242,7 +244,7 @@ static inline void check_daddi(void) int daddiu_bug = IS_ENABLED(CONFIG_CPU_MIPSR6) ? 0 : -1; -static inline void check_daddiu(void) +static __init void check_daddiu(void) { long v, w, tmp; From 18856604b3e7090ce42d533995173ee70c24b1c9 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 24 Sep 2019 17:15:56 -0700 Subject: [PATCH 205/312] RISC-V: Clear load reservations while restoring hart contexts This is almost entirely a comment. The bug is unlikely to manifest on existing hardware because there is a timeout on load reservations, but manifests on QEMU because there is no timeout. Signed-off-by: Palmer Dabbelt Reviewed-by: Christoph Hellwig Signed-off-by: Paul Walmsley --- arch/riscv/include/asm/asm.h | 1 + arch/riscv/kernel/entry.S | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 5a02b7d50940..9c992a88d858 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -22,6 +22,7 @@ #define REG_L __REG_SEL(ld, lw) #define REG_S __REG_SEL(sd, sw) +#define REG_SC __REG_SEL(sc.d, sc.w) #define SZREG __REG_SEL(8, 4) #define LGREG __REG_SEL(3, 2) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index da7aa88113c2..2d592da1e776 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -98,7 +98,26 @@ _save_context: */ .macro RESTORE_ALL REG_L a0, PT_SSTATUS(sp) - REG_L a2, PT_SEPC(sp) + /* + * The current load reservation is effectively part of the processor's + * state, in the sense that load reservations cannot be shared between + * different hart contexts. We can't actually save and restore a load + * reservation, so instead here we clear any existing reservation -- + * it's always legal for implementations to clear load reservations at + * any point (as long as the forward progress guarantee is kept, but + * we'll ignore that here). + * + * Dangling load reservations can be the result of taking a trap in the + * middle of an LR/SC sequence, but can also be the result of a taken + * forward branch around an SC -- which is how we implement CAS. As a + * result we need to clear reservations between the last CAS and the + * jump back to the new context. While it is unlikely the store + * completes, implementations are allowed to expand reservations to be + * arbitrarily large. + */ + REG_L a2, PT_SEPC(sp) + REG_SC x0, a2, PT_SEPC(sp) + csrw CSR_SSTATUS, a0 csrw CSR_SEPC, a2 From 922b0375fc93fb1a20c5617e37c389c26bbccb70 Mon Sep 17 00:00:00 2001 From: Albert Ou Date: Fri, 27 Sep 2019 16:14:18 -0700 Subject: [PATCH 206/312] riscv: Fix memblock reservation for device tree blob This fixes an error with how the FDT blob is reserved in memblock. An incorrect physical address calculation exposed the FDT header to unintended corruption, which typically manifested with of_fdt_raw_init() faulting during late boot after fdt_totalsize() returned a wrong value. Systems with smaller physical memory sizes more frequently trigger this issue, as the kernel is more likely to allocate from the DMA32 zone where bbl places the DTB after the kernel image. Commit 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages") changed the mapping of the DTB to reside in the fixmap area. Consequently, early_init_fdt_reserve_self() cannot be used anymore in setup_bootmem() since it relies on __pa() to derive a physical address, which does not work with dtb_early_va that is no longer a valid kernel logical address. The reserved[0x1] region shows the effect of the pointer underflow resulting from the __pa(initial_boot_params) offset subtraction: [ 0.000000] MEMBLOCK configuration: [ 0.000000] memory size = 0x000000001fe00000 reserved size = 0x0000000000a2e514 [ 0.000000] memory.cnt = 0x1 [ 0.000000] memory[0x0] [0x0000000080200000-0x000000009fffffff], 0x000000001fe00000 bytes flags: 0x0 [ 0.000000] reserved.cnt = 0x2 [ 0.000000] reserved[0x0] [0x0000000080200000-0x0000000080c2dfeb], 0x0000000000a2dfec bytes flags: 0x0 [ 0.000000] reserved[0x1] [0xfffffff080100000-0xfffffff080100527], 0x0000000000000528 bytes flags: 0x0 With the fix applied: [ 0.000000] MEMBLOCK configuration: [ 0.000000] memory size = 0x000000001fe00000 reserved size = 0x0000000000a2e514 [ 0.000000] memory.cnt = 0x1 [ 0.000000] memory[0x0] [0x0000000080200000-0x000000009fffffff], 0x000000001fe00000 bytes flags: 0x0 [ 0.000000] reserved.cnt = 0x2 [ 0.000000] reserved[0x0] [0x0000000080200000-0x0000000080c2dfeb], 0x0000000000a2dfec bytes flags: 0x0 [ 0.000000] reserved[0x1] [0x0000000080e00000-0x0000000080e00527], 0x0000000000000528 bytes flags: 0x0 Fixes: 671f9a3e2e24 ("RISC-V: Setup initial page tables in two stages") Signed-off-by: Albert Ou Tested-by: Bin Meng Reviewed-by: Anup Patel Signed-off-by: Paul Walmsley --- arch/riscv/mm/init.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f0ba71304b6e..83f7d12042fb 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,8 @@ disable: } #endif /* CONFIG_BLK_DEV_INITRD */ +static phys_addr_t dtb_early_pa __initdata; + void __init setup_bootmem(void) { struct memblock_region *reg; @@ -117,7 +120,12 @@ void __init setup_bootmem(void) setup_initrd(); #endif /* CONFIG_BLK_DEV_INITRD */ - early_init_fdt_reserve_self(); + /* + * Avoid using early_init_fdt_reserve_self() since __pa() does + * not work for DTB pointers that are fixmap addresses + */ + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + early_init_fdt_scan_reserved_mem(); memblock_allow_resize(); memblock_dump_all(); @@ -393,6 +401,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) /* Save pointer to DTB for early FDT parsing */ dtb_early_va = (void *)fix_to_virt(FIX_FDT) + (dtb_pa & ~PAGE_MASK); + /* Save physical address for memblock reservation */ + dtb_early_pa = dtb_pa; } static void __init setup_vm_final(void) From 8353da9fa69722b54cba82b2ec740afd3d438748 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 30 Sep 2019 17:12:41 +0200 Subject: [PATCH 207/312] hso: fix NULL-deref on tty open Fix NULL-pointer dereference on tty open due to a failure to handle a missing interrupt-in endpoint when probing modem ports: BUG: kernel NULL pointer dereference, address: 0000000000000006 ... RIP: 0010:tiocmget_submit_urb+0x1c/0xe0 [hso] ... Call Trace: hso_start_serial_device+0xdc/0x140 [hso] hso_serial_open+0x118/0x1b0 [hso] tty_open+0xf1/0x490 Fixes: 542f54823614 ("tty: Modem functions for the HSO driver") Signed-off-by: Johan Hovold Signed-off-by: David S. Miller --- drivers/net/usb/hso.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index ce78714f536f..a505b2ab88b8 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2620,14 +2620,18 @@ static struct hso_device *hso_create_bulk_serial_device( */ if (serial->tiocmget) { tiocmget = serial->tiocmget; + tiocmget->endp = hso_get_ep(interface, + USB_ENDPOINT_XFER_INT, + USB_DIR_IN); + if (!tiocmget->endp) { + dev_err(&interface->dev, "Failed to find INT IN ep\n"); + goto exit; + } + tiocmget->urb = usb_alloc_urb(0, GFP_KERNEL); if (tiocmget->urb) { mutex_init(&tiocmget->mutex); init_waitqueue_head(&tiocmget->waitq); - tiocmget->endp = hso_get_ep( - interface, - USB_ENDPOINT_XFER_INT, - USB_DIR_IN); } else hso_free_tiomget(serial); } From 0d9138ffac24cf8b75366ede3a68c951e6dcc575 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 30 Sep 2019 18:43:50 +0000 Subject: [PATCH 208/312] vsock: Fix a lockdep warning in __vsock_release() Lockdep is unhappy if two locks from the same class are held. Fix the below warning for hyperv and virtio sockets (vmci socket code doesn't have the issue) by using lock_sock_nested() when __vsock_release() is called recursively: ============================================ WARNING: possible recursive locking detected 5.3.0+ #1 Not tainted -------------------------------------------- server/1795 is trying to acquire lock: ffff8880c5158990 (sk_lock-AF_VSOCK){+.+.}, at: hvs_release+0x10/0x120 [hv_sock] but task is already holding lock: ffff8880c5158150 (sk_lock-AF_VSOCK){+.+.}, at: __vsock_release+0x2e/0xf0 [vsock] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(sk_lock-AF_VSOCK); lock(sk_lock-AF_VSOCK); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by server/1795: #0: ffff8880c5d05ff8 (&sb->s_type->i_mutex_key#10){+.+.}, at: __sock_release+0x2d/0xa0 #1: ffff8880c5158150 (sk_lock-AF_VSOCK){+.+.}, at: __vsock_release+0x2e/0xf0 [vsock] stack backtrace: CPU: 5 PID: 1795 Comm: server Not tainted 5.3.0+ #1 Call Trace: dump_stack+0x67/0x90 __lock_acquire.cold.67+0xd2/0x20b lock_acquire+0xb5/0x1c0 lock_sock_nested+0x6d/0x90 hvs_release+0x10/0x120 [hv_sock] __vsock_release+0x24/0xf0 [vsock] __vsock_release+0xa0/0xf0 [vsock] vsock_release+0x12/0x30 [vsock] __sock_release+0x37/0xa0 sock_close+0x14/0x20 __fput+0xc1/0x250 task_work_run+0x98/0xc0 do_exit+0x344/0xc60 do_group_exit+0x47/0xb0 get_signal+0x15c/0xc50 do_signal+0x30/0x720 exit_to_usermode_loop+0x50/0xa0 do_syscall_64+0x24e/0x270 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f4184e85f31 Tested-by: Stefano Garzarella Signed-off-by: Dexuan Cui Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 16 ++++++++++++---- net/vmw_vsock/hyperv_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ab47bf3ab66e..2ab43b2bba31 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -638,7 +638,7 @@ struct sock *__vsock_create(struct net *net, } EXPORT_SYMBOL_GPL(__vsock_create); -static void __vsock_release(struct sock *sk) +static void __vsock_release(struct sock *sk, int level) { if (sk) { struct sk_buff *skb; @@ -648,9 +648,17 @@ static void __vsock_release(struct sock *sk) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ + /* The release call is supposed to use lock_sock_nested() + * rather than lock_sock(), if a sock lock should be acquired. + */ transport->release(vsk); - lock_sock(sk); + /* When "level" is SINGLE_DEPTH_NESTING, use the nested + * version to avoid the warning "possible recursive locking + * detected". When "level" is 0, lock_sock_nested(sk, level) + * is the same as lock_sock(sk). + */ + lock_sock_nested(sk, level); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -659,7 +667,7 @@ static void __vsock_release(struct sock *sk) /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { - __vsock_release(pending); + __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } @@ -708,7 +716,7 @@ EXPORT_SYMBOL_GPL(vsock_stream_has_space); static int vsock_release(struct socket *sock) { - __vsock_release(sock->sk); + __vsock_release(sock->sk, 0); sock->sk = NULL; sock->state = SS_FREE; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 261521d286d6..c443db7af8d4 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -559,7 +559,7 @@ static void hvs_release(struct vsock_sock *vsk) struct sock *sk = sk_vsock(vsk); bool remove_sock; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); remove_sock = hvs_close_lock_held(vsk); release_sock(sk); if (remove_sock) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 5bb70c692b1e..a666ef8fc54e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -820,7 +820,7 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); From 3256a2d6ab1f71f9a1bd2d7f6f18eb8108c48d17 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Sep 2019 15:44:44 -0700 Subject: [PATCH 209/312] tcp: adjust rto_base in retransmits_timed_out() The cited commit exposed an old retransmits_timed_out() bug which assumed it could call tcp_model_timeout() with TCP_RTO_MIN as rto_base for all states. But flows in SYN_SENT or SYN_RECV state uses a different RTO base (1 sec instead of 200 ms, unless BPF choses another value) This caused a reduction of SYN retransmits from 6 to 4 with the default /proc/sys/net/ipv4/tcp_syn_retries value. Fixes: a41e8a88b06e ("tcp: better handle TCP_USER_TIMEOUT in SYN_SENT state") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Marek Majkowski Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 40de2d2364a1..05be564414e9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -198,8 +198,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (likely(timeout == 0)) - timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); + if (likely(timeout == 0)) { + unsigned int rto_base = TCP_RTO_MIN; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + rto_base = tcp_timeout_init(sk); + timeout = tcp_model_timeout(sk, boundary, rto_base); + } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } From a3ce2a21bb8969ae27917281244fa91bf5f286d7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Sep 2019 20:28:34 -0700 Subject: [PATCH 210/312] ipv6: Handle race in addrconf_dad_work Rajendra reported a kernel panic when a link was taken down: [ 6870.263084] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 [ 6870.271856] IP: [] __ipv6_ifa_notify+0x154/0x290 [ 6870.570501] Call Trace: [ 6870.573238] [] ? ipv6_ifa_notify+0x26/0x40 [ 6870.579665] [] ? addrconf_dad_completed+0x4c/0x2c0 [ 6870.586869] [] ? ipv6_dev_mc_inc+0x196/0x260 [ 6870.593491] [] ? addrconf_dad_work+0x10a/0x430 [ 6870.600305] [] ? __switch_to_asm+0x34/0x70 [ 6870.606732] [] ? process_one_work+0x18a/0x430 [ 6870.613449] [] ? worker_thread+0x4d/0x490 [ 6870.619778] [] ? process_one_work+0x430/0x430 [ 6870.626495] [] ? kthread+0xd9/0xf0 [ 6870.632145] [] ? __switch_to_asm+0x34/0x70 [ 6870.638573] [] ? kthread_park+0x60/0x60 [ 6870.644707] [] ? ret_from_fork+0x57/0x70 [ 6870.650936] Code: 31 c0 31 d2 41 b9 20 00 08 02 b9 09 00 00 0 addrconf_dad_work is kicked to be scheduled when a device is brought up. There is a race between addrcond_dad_work getting scheduled and taking the rtnl lock and a process taking the link down (under rtnl). The latter removes the host route from the inet6_addr as part of addrconf_ifdown which is run for NETDEV_DOWN. The former attempts to use the host route in ipv6_ifa_notify. If the down event removes the host route due to the race to the rtnl, then the BUG listed above occurs. This scenario does not occur when the ipv6 address is not kept (net.ipv6.conf.all.keep_addr_on_down = 0) as addrconf_ifdown sets the state of the ifp to DEAD. Handle when the addresses are kept by checking IF_READY which is reset by addrconf_ifdown. The 'dead' flag for an inet6_addr is set only under rtnl, in addrconf_ifdown and it means the device is getting removed (or IPv6 is disabled). The interesting cases for changing the idev flag are addrconf_notify (NETDEV_UP and NETDEV_CHANGE) and addrconf_ifdown (reset the flag). The former does not have the idev lock - only rtnl; the latter has both. Based on that the existing dead + IF_READY check can be moved to right after the rtnl_lock in addrconf_dad_work. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Reported-by: Rajendra Dendukuri Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a576ff92c39..dd3be06d5a06 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4032,6 +4032,12 @@ static void addrconf_dad_work(struct work_struct *w) rtnl_lock(); + /* check if device was taken down before this delayed work + * function could be canceled + */ + if (idev->dead || !(idev->if_flags & IF_READY)) + goto out; + spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { action = DAD_BEGIN; @@ -4077,11 +4083,6 @@ static void addrconf_dad_work(struct work_struct *w) goto out; write_lock_bh(&idev->lock); - if (idev->dead || !(idev->if_flags & IF_READY)) { - write_unlock_bh(&idev->lock); - goto out; - } - spin_lock(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DEAD) { spin_unlock(&ifp->lock); From a761129e3625688310aecf26e1be9e98e85f8eb5 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 1 Oct 2019 21:56:41 +0800 Subject: [PATCH 211/312] xen-netfront: do not use ~0U as error return value for xennet_fill_frags() xennet_fill_frags() uses ~0U as return value when the sk_buff is not able to cache extra fragments. This is incorrect because the return type of xennet_fill_frags() is RING_IDX and 0xffffffff is an expected value for ring buffer index. In the situation when the rsp_cons is approaching 0xffffffff, the return value of xennet_fill_frags() may become 0xffffffff which xennet_poll() (the caller) would regard as error. As a result, queue->rx.rsp_cons is set incorrectly because it is updated only when there is error. If there is no error, xennet_poll() would be responsible to update queue->rx.rsp_cons. Finally, queue->rx.rsp_cons would point to the rx ring buffer entries whose queue->rx_skbs[i] and queue->grant_rx_ref[i] are already cleared to NULL. This leads to NULL pointer access in the next iteration to process rx ring buffer entries. The symptom is similar to the one fixed in commit 00b368502d18 ("xen-netfront: do not assume sk_buff_head list is empty in error handling"). This patch changes the return type of xennet_fill_frags() to indicate whether it is successful or failed. The queue->rx.rsp_cons will be always updated inside this function. Fixes: ad4f15dc2c70 ("xen/netfront: don't bug in case of too many frags") Signed-off-by: Dongli Zhang Reviewed-by: Juergen Gross Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index e14ec75b61d6..482c6c8b0fb7 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -887,9 +887,9 @@ static int xennet_set_skb_gso(struct sk_buff *skb, return 0; } -static RING_IDX xennet_fill_frags(struct netfront_queue *queue, - struct sk_buff *skb, - struct sk_buff_head *list) +static int xennet_fill_frags(struct netfront_queue *queue, + struct sk_buff *skb, + struct sk_buff_head *list) { RING_IDX cons = queue->rx.rsp_cons; struct sk_buff *nskb; @@ -908,7 +908,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) { queue->rx.rsp_cons = ++cons + skb_queue_len(list); kfree_skb(nskb); - return ~0U; + return -ENOENT; } skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, @@ -919,7 +919,9 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, kfree_skb(nskb); } - return cons; + queue->rx.rsp_cons = cons; + + return 0; } static int checksum_setup(struct net_device *dev, struct sk_buff *skb) @@ -1045,8 +1047,7 @@ err: skb->data_len = rx->status; skb->len += rx->status; - i = xennet_fill_frags(queue, skb, &tmpq); - if (unlikely(i == ~0U)) + if (unlikely(xennet_fill_frags(queue, skb, &tmpq))) goto err; if (rx->flags & XEN_NETRXF_csum_blank) @@ -1056,7 +1057,7 @@ err: __skb_queue_tail(&rxq, skb); - queue->rx.rsp_cons = ++i; + i = ++queue->rx.rsp_cons; work_done++; } From ec066de1a567169c4737a5933218c329cedf52bd Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 16 Sep 2019 09:51:33 +0000 Subject: [PATCH 212/312] arm: xen: mm: use __GPF_DMA32 for arm64 arm64 shares some code under arch/arm/xen, including mm.c. However ZONE_DMA is removed by commit ad67f5a6545("arm64: replace ZONE_DMA with ZONE_DMA32"). So add a check if CONFIG_ZONE_DMA32 is enabled use __GFP_DMA32. Signed-off-by: Peng Fan Acked-by: Stefano Stabellini Signed-off-by: Stefano Stabellini --- arch/arm/xen/mm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index d33b77e9add3..018093fef80a 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -28,7 +28,10 @@ unsigned long xen_get_swiotlb_free_pages(unsigned int order) for_each_memblock(memory, reg) { if (reg->base < (phys_addr_t)0xffffffff) { - flags |= __GFP_DMA; + if (IS_ENABLED(CONFIG_ZONE_DMA32)) + flags |= __GFP_DMA32; + else + flags |= __GFP_DMA; break; } } From 09515706857a7d5a2ffb5ce6a44c0bc7859a745b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 1 Oct 2019 10:25:34 +0200 Subject: [PATCH 213/312] xen/efi: have a common runtime setup function Today the EFI runtime functions are setup in architecture specific code (x86 and arm), with the functions themselves living in drivers/xen as they are not architecture dependent. As the setup is exactly the same for arm and x86 move the setup to drivers/xen, too. This at once removes the need to make the single functions global visible. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich [boris: "Dropped EXPORT_SYMBOL_GPL(xen_efi_runtime_setup)"] Signed-off-by: Boris Ostrovsky --- arch/arm/include/asm/xen/xen-ops.h | 6 -- arch/arm/xen/Makefile | 1 - arch/arm/xen/efi.c | 30 ---------- arch/arm/xen/enlighten.c | 1 - arch/arm64/include/asm/xen/xen-ops.h | 7 --- arch/arm64/xen/Makefile | 1 - arch/x86/xen/efi.c | 16 +----- drivers/xen/efi.c | 84 ++++++++++++++++------------ include/xen/xen-ops.h | 25 +-------- 9 files changed, 49 insertions(+), 122 deletions(-) delete mode 100644 arch/arm/include/asm/xen/xen-ops.h delete mode 100644 arch/arm/xen/efi.c delete mode 100644 arch/arm64/include/asm/xen/xen-ops.h diff --git a/arch/arm/include/asm/xen/xen-ops.h b/arch/arm/include/asm/xen/xen-ops.h deleted file mode 100644 index ec154e719b11..000000000000 --- a/arch/arm/include/asm/xen/xen-ops.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_XEN_OPS_H -#define _ASM_XEN_OPS_H - -void xen_efi_runtime_setup(void); - -#endif /* _ASM_XEN_OPS_H */ diff --git a/arch/arm/xen/Makefile b/arch/arm/xen/Makefile index 7ed28982c4c3..c32d04713ba0 100644 --- a/arch/arm/xen/Makefile +++ b/arch/arm/xen/Makefile @@ -1,3 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := enlighten.o hypercall.o grant-table.o p2m.o mm.o -obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/arm/xen/efi.c b/arch/arm/xen/efi.c deleted file mode 100644 index cb2aaf98e243..000000000000 --- a/arch/arm/xen/efi.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2015, Linaro Limited, Shannon Zhao - */ - -#include -#include -#include - -/* Set XEN EFI runtime services function pointers. Other fields of struct efi, - * e.g. efi.systab, will be set like normal EFI. - */ -void __init xen_efi_runtime_setup(void) -{ - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.set_variable_nonblocking = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.query_variable_info_nonblocking = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; -} -EXPORT_SYMBOL_GPL(xen_efi_runtime_setup); diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 522c97d43ef8..dd6804a64f1a 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm64/include/asm/xen/xen-ops.h b/arch/arm64/include/asm/xen/xen-ops.h deleted file mode 100644 index e6e784051932..000000000000 --- a/arch/arm64/include/asm/xen/xen-ops.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_XEN_OPS_H -#define _ASM_XEN_OPS_H - -void xen_efi_runtime_setup(void); - -#endif /* _ASM_XEN_OPS_H */ diff --git a/arch/arm64/xen/Makefile b/arch/arm64/xen/Makefile index a4fc65f3928d..b66215e8658e 100644 --- a/arch/arm64/xen/Makefile +++ b/arch/arm64/xen/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only xen-arm-y += $(addprefix ../../arm/xen/, enlighten.o grant-table.o p2m.o mm.o) obj-y := xen-arm.o hypercall.o -obj-$(CONFIG_XEN_EFI) += $(addprefix ../../arm/xen/, efi.o) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 7e3eb70f411a..a04551ee5568 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -57,21 +57,7 @@ static efi_system_table_t __init *xen_efi_probe(void) return NULL; /* Here we know that Xen runs on EFI platform. */ - - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.set_variable_nonblocking = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.query_variable_info_nonblocking = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; + xen_efi_runtime_setup(); efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 89d60f8e3c18..d1ff2186ebb4 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -40,7 +40,7 @@ #define efi_data(op) (op.u.efi_runtime_call) -efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { struct xen_platform_op op = INIT_EFI_OP(get_time); @@ -61,9 +61,8 @@ efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_time); -efi_status_t xen_efi_set_time(efi_time_t *tm) +static efi_status_t xen_efi_set_time(efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(set_time); @@ -75,10 +74,10 @@ efi_status_t xen_efi_set_time(efi_time_t *tm) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_time); -efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, - efi_time_t *tm) +static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time); @@ -98,9 +97,8 @@ efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_wakeup_time); -efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { struct xen_platform_op op = INIT_EFI_OP(set_wakeup_time); @@ -117,11 +115,10 @@ efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_wakeup_time); -efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 *attr, unsigned long *data_size, - void *data) +static efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 *attr, unsigned long *data_size, + void *data) { struct xen_platform_op op = INIT_EFI_OP(get_variable); @@ -141,11 +138,10 @@ efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_variable); -efi_status_t xen_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) +static efi_status_t xen_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) { struct xen_platform_op op = INIT_EFI_OP(get_next_variable_name); @@ -165,11 +161,10 @@ efi_status_t xen_efi_get_next_variable(unsigned long *name_size, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_next_variable); -efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 attr, unsigned long data_size, - void *data) +static efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) { struct xen_platform_op op = INIT_EFI_OP(set_variable); @@ -186,11 +181,10 @@ efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_set_variable); -efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size) +static efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) { struct xen_platform_op op = INIT_EFI_OP(query_variable_info); @@ -208,9 +202,8 @@ efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_query_variable_info); -efi_status_t xen_efi_get_next_high_mono_count(u32 *count) +static efi_status_t xen_efi_get_next_high_mono_count(u32 *count) { struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count); @@ -221,10 +214,9 @@ efi_status_t xen_efi_get_next_high_mono_count(u32 *count) return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_get_next_high_mono_count); -efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, unsigned long sg_list) +static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, unsigned long sg_list) { struct xen_platform_op op = INIT_EFI_OP(update_capsule); @@ -241,11 +233,9 @@ efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_update_capsule); -efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, u64 *max_size, - int *reset_type) +static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, u64 *max_size, int *reset_type) { struct xen_platform_op op = INIT_EFI_OP(query_capsule_capabilities); @@ -264,10 +254,9 @@ efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, return efi_data(op).status; } -EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps); -void xen_efi_reset_system(int reset_type, efi_status_t status, - unsigned long data_size, efi_char16_t *data) +static void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) { switch (reset_type) { case EFI_RESET_COLD: @@ -281,4 +270,25 @@ void xen_efi_reset_system(int reset_type, efi_status_t status, BUG(); } } -EXPORT_SYMBOL_GPL(xen_efi_reset_system); + +/* + * Set XEN EFI runtime services function pointers. Other fields of struct efi, + * e.g. efi.systab, will be set like normal EFI. + */ +void __init xen_efi_runtime_setup(void) +{ + efi.get_time = xen_efi_get_time; + efi.set_time = xen_efi_set_time; + efi.get_wakeup_time = xen_efi_get_wakeup_time; + efi.set_wakeup_time = xen_efi_set_wakeup_time; + efi.get_variable = xen_efi_get_variable; + efi.get_next_variable = xen_efi_get_next_variable; + efi.set_variable = xen_efi_set_variable; + efi.set_variable_nonblocking = xen_efi_set_variable; + efi.query_variable_info = xen_efi_query_variable_info; + efi.query_variable_info_nonblocking = xen_efi_query_variable_info; + efi.update_capsule = xen_efi_update_capsule; + efi.query_capsule_caps = xen_efi_query_capsule_caps; + efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; + efi.reset_system = xen_efi_reset_system; +} diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 98b30c1613b2..d89969aa9942 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -212,30 +212,7 @@ int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr, bool xen_running_on_version_or_later(unsigned int major, unsigned int minor); -efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc); -efi_status_t xen_efi_set_time(efi_time_t *tm); -efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, - efi_time_t *tm); -efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm); -efi_status_t xen_efi_get_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 *attr, unsigned long *data_size, - void *data); -efi_status_t xen_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, efi_guid_t *vendor); -efi_status_t xen_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - u32 attr, unsigned long data_size, - void *data); -efi_status_t xen_efi_query_variable_info(u32 attr, u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size); -efi_status_t xen_efi_get_next_high_mono_count(u32 *count); -efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, unsigned long sg_list); -efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, u64 *max_size, - int *reset_type); -void xen_efi_reset_system(int reset_type, efi_status_t status, - unsigned long data_size, efi_char16_t *data); +void xen_efi_runtime_setup(void); #ifdef CONFIG_PREEMPT From e95584a889e1902fdf1ded9712e2c3c3083baf96 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Wed, 2 Oct 2019 18:49:43 +0700 Subject: [PATCH 214/312] tipc: fix unlimited bundling of small messages We have identified a problem with the "oversubscription" policy in the link transmission code. When small messages are transmitted, and the sending link has reached the transmit window limit, those messages will be bundled and put into the link backlog queue. However, bundles of data messages are counted at the 'CRITICAL' level, so that the counter for that level, instead of the counter for the real, bundled message's level is the one being increased. Subsequent, to-be-bundled data messages at non-CRITICAL levels continue to be tested against the unchanged counter for their own level, while contributing to an unrestrained increase at the CRITICAL backlog level. This leaves a gap in congestion control algorithm for small messages that can result in starvation for other users or a "real" CRITICAL user. Even that eventually can lead to buffer exhaustion & link reset. We fix this by keeping a 'target_bskb' buffer pointer at each levels, then when bundling, we only bundle messages at the same importance level only. This way, we know exactly how many slots a certain level have occupied in the queue, so can manage level congestion accurately. By bundling messages at the same level, we even have more benefits. Let consider this: - One socket sends 64-byte messages at the 'CRITICAL' level; - Another sends 4096-byte messages at the 'LOW' level; When a 64-byte message comes and is bundled the first time, we put the overhead of message bundle to it (+ 40-byte header, data copy, etc.) for later use, but the next message can be a 4096-byte one that cannot be bundled to the previous one. This means the last bundle carries only one payload message which is totally inefficient, as for the receiver also! Later on, another 64-byte message comes, now we make a new bundle and the same story repeats... With the new bundling algorithm, this will not happen, the 64-byte messages will be bundled together even when the 4096-byte message(s) comes in between. However, if the 4096-byte messages are sent at the same level i.e. 'CRITICAL', the bundling algorithm will again cause the same overhead. Also, the same will happen even with only one socket sending small messages at a rate close to the link transmit's one, so that, when one message is bundled, it's transmitted shortly. Then, another message comes, a new bundle is created and so on... We will solve this issue radically by another patch. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Reported-by: Hoang Le Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 29 ++++++++++++++++++----------- net/tipc/msg.c | 5 +---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 6cc75ffd9e2c..999eab592de8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -160,6 +160,7 @@ struct tipc_link { struct { u16 len; u16 limit; + struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; u16 window; @@ -880,6 +881,7 @@ static void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { struct sk_buff_head list; + u32 imp; __skb_queue_head_init(&list); @@ -901,11 +903,10 @@ void tipc_link_reset(struct tipc_link *l) __skb_queue_purge(&l->deferdq); __skb_queue_purge(&l->backlogq); __skb_queue_purge(&l->failover_deferdq); - l->backlog[TIPC_LOW_IMPORTANCE].len = 0; - l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; - l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; - l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; - l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; + for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) { + l->backlog[imp].len = 0; + l->backlog[imp].target_bskb = NULL; + } kfree_skb(l->reasm_buf); kfree_skb(l->reasm_tnlmsg); kfree_skb(l->failover_reasm_skb); @@ -947,7 +948,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; struct sk_buff_head *transmq = &l->transmq; struct sk_buff_head *backlogq = &l->backlogq; - struct sk_buff *skb, *_skb, *bskb; + struct sk_buff *skb, *_skb, **tskb; int pkt_cnt = skb_queue_len(list); int rc = 0; @@ -999,19 +1000,21 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, seqno++; continue; } - if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) { + tskb = &l->backlog[imp].target_bskb; + if (tipc_msg_bundle(*tskb, hdr, mtu)) { kfree_skb(__skb_dequeue(list)); l->stats.sent_bundled++; continue; } - if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) { + if (tipc_msg_make_bundle(tskb, hdr, mtu, l->addr)) { kfree_skb(__skb_dequeue(list)); - __skb_queue_tail(backlogq, bskb); - l->backlog[msg_importance(buf_msg(bskb))].len++; + __skb_queue_tail(backlogq, *tskb); + l->backlog[imp].len++; l->stats.sent_bundled++; l->stats.sent_bundles++; continue; } + l->backlog[imp].target_bskb = NULL; l->backlog[imp].len += skb_queue_len(list); skb_queue_splice_tail_init(list, backlogq); } @@ -1027,6 +1030,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, u16 seqno = l->snd_nxt; u16 ack = l->rcv_nxt - 1; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u32 imp; while (skb_queue_len(&l->transmq) < l->window) { skb = skb_peek(&l->backlogq); @@ -1037,7 +1041,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l, break; __skb_dequeue(&l->backlogq); hdr = buf_msg(skb); - l->backlog[msg_importance(hdr)].len--; + imp = msg_importance(hdr); + l->backlog[imp].len--; + if (unlikely(skb == l->backlog[imp].target_bskb)) + l->backlog[imp].target_bskb = NULL; __skb_queue_tail(&l->transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index e6d49cdc61b4..922d262e153f 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -543,10 +543,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, bmsg = buf_msg(_skb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); - if (msg_isdata(msg)) - msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); - else - msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); + msg_set_importance(bmsg, msg_importance(msg)); msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); From 8b6b82ad163b54a23f0e89710fb25461d45109c4 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 2 Oct 2019 14:12:41 +0200 Subject: [PATCH 215/312] mlx5: avoid 64-bit division in dr_icm_pool_mr_create() Recently added code introduces 64-bit division in dr_icm_pool_mr_create() so that build on 32-bit architectures fails with ERROR: "__umoddi3" [drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.ko] undefined! As the divisor is always a power of 2, we can use bitwise operation instead. Fixes: 29cf8febd185 ("net/mlx5: DR, ICM pool memory allocator") Reported-by: Borislav Petkov Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c index 913f1e5aaaf2..d7c7467e2d53 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c @@ -137,7 +137,8 @@ dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool, icm_mr->icm_start_addr = icm_mr->dm.addr; - align_diff = icm_mr->icm_start_addr % align_base; + /* align_base is always a power of 2 */ + align_diff = icm_mr->icm_start_addr & (align_base - 1); if (align_diff) icm_mr->used_length = align_base - align_diff; From e8521e53cca584ddf8ec4584d3c550a6c65f88c4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 1 Oct 2019 16:28:43 +0200 Subject: [PATCH 216/312] net: dsa: rtl8366: Check VLAN ID and not ports There has been some confusion between the port number and the VLAN ID in this driver. What we need to check for validity is the VLAN ID, nothing else. The current confusion came from assigning a few default VLANs for default routing and we need to rewrite that properly. Instead of checking if the port number is a valid VLAN ID, check the actual VLAN IDs passed in to the callback one by one as expected. Fixes: d8652956cf37 ("net: dsa: realtek-smi: Add Realtek SMI driver") Signed-off-by: Linus Walleij Signed-off-by: David S. Miller --- drivers/net/dsa/rtl8366.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/rtl8366.c b/drivers/net/dsa/rtl8366.c index ca3d17e43ed8..ac88caca5ad4 100644 --- a/drivers/net/dsa/rtl8366.c +++ b/drivers/net/dsa/rtl8366.c @@ -339,10 +339,12 @@ int rtl8366_vlan_prepare(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { struct realtek_smi *smi = ds->priv; + u16 vid; int ret; - if (!smi->ops->is_vlan_valid(smi, port)) - return -EINVAL; + for (vid = vlan->vid_begin; vid < vlan->vid_end; vid++) + if (!smi->ops->is_vlan_valid(smi, vid)) + return -EINVAL; dev_info(smi->dev, "prepare VLANs %04x..%04x\n", vlan->vid_begin, vlan->vid_end); @@ -370,8 +372,9 @@ void rtl8366_vlan_add(struct dsa_switch *ds, int port, u16 vid; int ret; - if (!smi->ops->is_vlan_valid(smi, port)) - return; + for (vid = vlan->vid_begin; vid < vlan->vid_end; vid++) + if (!smi->ops->is_vlan_valid(smi, vid)) + return; dev_info(smi->dev, "add VLAN on port %d, %s, %s\n", port, From d64bf89a75b65f83f06be9fb8f978e60d53752db Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Tue, 1 Oct 2019 10:21:02 -0700 Subject: [PATCH 217/312] net/rds: Fix error handling in rds_ib_add_one() rds_ibdev:ipaddr_list and rds_ibdev:conn_list are initialized after allocation some resources such as protection domain. If allocation of such resources fail, then these uninitialized variables are accessed in rds_ib_dev_free() in failure path. This can potentially crash the system. The code has been updated to initialize these variables very early in the function. Signed-off-by: Dotan Barak Signed-off-by: Sudhakar Dindukurti Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 45acab2de0cf..9de2ae22d583 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -143,6 +143,9 @@ static void rds_ib_add_one(struct ib_device *device) refcount_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); @@ -203,9 +206,6 @@ static void rds_ib_add_one(struct ib_device *device) device->name, rds_ibdev->use_fastreg ? "FRMR" : "FMR"); - INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); - INIT_LIST_HEAD(&rds_ibdev->conn_list); - down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); up_write(&rds_ib_devices_lock); From d6530e5ad45089c018c3cc5b5957a34721249f6f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 21:58:18 +0300 Subject: [PATCH 218/312] net: dsa: sja1105: Initialize the meta_lock Otherwise, with CONFIG_DEBUG_SPINLOCK=y, this stack trace gets printed when enabling RX timestamping and receiving a PTP frame: [ 318.537078] INFO: trying to register non-static key. [ 318.542040] the code is fine but needs lockdep annotation. [ 318.547500] turning off the locking correctness validator. [ 318.552972] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-13257-g0825b0669811-dirty #1962 [ 318.561283] Hardware name: Freescale LS1021A [ 318.565566] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 318.573289] [] (show_stack) from [] (dump_stack+0xd4/0x100) [ 318.580579] [] (dump_stack) from [] (register_lock_class+0x728/0x734) [ 318.588731] [] (register_lock_class) from [] (__lock_acquire+0x78/0x25cc) [ 318.597227] [] (__lock_acquire) from [] (lock_acquire+0xd8/0x234) [ 318.605033] [] (lock_acquire) from [] (_raw_spin_lock+0x44/0x54) [ 318.612755] [] (_raw_spin_lock) from [] (sja1105_rcv+0x1f8/0x4e8) [ 318.620561] [] (sja1105_rcv) from [] (dsa_switch_rcv+0x80/0x204) [ 318.628283] [] (dsa_switch_rcv) from [] (__netif_receive_skb_one_core+0x50/0x6c) [ 318.637386] [] (__netif_receive_skb_one_core) from [] (netif_receive_skb_internal+0xac/0x264) [ 318.647611] [] (netif_receive_skb_internal) from [] (napi_gro_receive+0x1d8/0x338) [ 318.656887] [] (napi_gro_receive) from [] (gfar_clean_rx_ring+0x328/0x724) [ 318.665472] [] (gfar_clean_rx_ring) from [] (gfar_poll_rx_sq+0x34/0x94) [ 318.673795] [] (gfar_poll_rx_sq) from [] (net_rx_action+0x128/0x4f8) [ 318.681860] [] (net_rx_action) from [] (__do_softirq+0x148/0x5ac) [ 318.689666] [] (__do_softirq) from [] (irq_exit+0x160/0x170) [ 318.697040] [] (irq_exit) from [] (__handle_domain_irq+0x60/0xb4) [ 318.704847] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0x9c) [ 318.713172] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98) [ 318.720622] Exception stack(0xc2001f18 to 0xc2001f60) [ 318.725656] 1f00: 00000001 00000006 [ 318.733805] 1f20: 00000000 c20165c0 ffffe000 c2010cac c2010cf4 00000001 00000000 c2010c88 [ 318.741955] 1f40: c1f7a5a8 00000000 00000000 c2001f68 c03ba140 c030a288 200e0013 ffffffff [ 318.750110] [] (__irq_svc) from [] (arch_cpu_idle+0x24/0x3c) [ 318.757486] [] (arch_cpu_idle) from [] (do_idle+0x1b8/0x2a4) [ 318.764859] [] (do_idle) from [] (cpu_startup_entry+0x18/0x1c) [ 318.772407] [] (cpu_startup_entry) from [] (start_kernel+0x4cc/0x4fc) Fixes: 844d7edc6a34 ("net: dsa: sja1105: Add a global sja1105_tagger_data structure") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index f3d38ff144c4..ea2e7f4f96d0 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2201,6 +2201,7 @@ static int sja1105_probe(struct spi_device *spi) tagger_data = &priv->tagger_data; skb_queue_head_init(&tagger_data->skb_rxtstamp_queue); INIT_WORK(&tagger_data->rxtstamp_work, sja1105_rxtstamp_work); + spin_lock_init(&tagger_data->meta_lock); /* Connections between dsa_port and sja1105_port */ for (i = 0; i < SJA1105_NUM_PORTS; i++) { From 3e8db7e56082156a37b71d7334860c10fcea8025 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 21:58:19 +0300 Subject: [PATCH 219/312] net: dsa: sja1105: Fix sleeping while atomic in .port_hwtstamp_set Currently this stack trace can be seen with CONFIG_DEBUG_ATOMIC_SLEEP=y: [ 41.568348] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 41.576757] in_atomic(): 1, irqs_disabled(): 0, pid: 208, name: ptp4l [ 41.583212] INFO: lockdep is turned off. [ 41.587123] CPU: 1 PID: 208 Comm: ptp4l Not tainted 5.3.0-rc6-01445-ge950f2d4bc7f-dirty #1827 [ 41.599873] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 41.607584] [] (show_stack) from [] (dump_stack+0xd4/0x100) [ 41.614863] [] (dump_stack) from [] (___might_sleep+0x1c8/0x2b4) [ 41.622574] [] (___might_sleep) from [] (__mutex_lock+0x48/0xab8) [ 41.630368] [] (__mutex_lock) from [] (mutex_lock_nested+0x1c/0x24) [ 41.638340] [] (mutex_lock_nested) from [] (sja1105_static_config_reload+0x30/0x27c) [ 41.647779] [] (sja1105_static_config_reload) from [] (sja1105_hwtstamp_set+0x108/0x1cc) [ 41.657562] [] (sja1105_hwtstamp_set) from [] (dev_ifsioc+0x18c/0x330) [ 41.665788] [] (dev_ifsioc) from [] (dev_ioctl+0x320/0x6e8) [ 41.673064] [] (dev_ioctl) from [] (sock_ioctl+0x334/0x5e8) [ 41.680340] [] (sock_ioctl) from [] (do_vfs_ioctl+0xb0/0xa10) [ 41.687789] [] (do_vfs_ioctl) from [] (ksys_ioctl+0x34/0x58) [ 41.695151] [] (ksys_ioctl) from [] (ret_fast_syscall+0x0/0x28) [ 41.702768] Exception stack(0xe8495fa8 to 0xe8495ff0) [ 41.707796] 5fa0: beff4a8c 00000001 00000011 000089b0 beff4a8c beff4a80 [ 41.715933] 5fc0: beff4a8c 00000001 0000000c 00000036 b6fa98c8 004e19c1 00000001 00000000 [ 41.724069] 5fe0: 004dcedc beff4a6c 004c0738 b6e7af4c [ 41.729860] BUG: scheduling while atomic: ptp4l/208/0x00000002 [ 41.735682] INFO: lockdep is turned off. Enabling RX timestamping will logically disturb the fastpath (processing of meta frames). Replace bool hwts_rx_en with a bit that is checked atomically from the fastpath and temporarily unset from the sleepable context during a change of the RX timestamping process (a destructive operation anyways, requires switch reset). If found unset, the fastpath (net/dsa/tag_sja1105.c) will just drop any received meta frame and not take the meta_lock at all. Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 19 +++++++++++-------- include/linux/dsa/sja1105.h | 4 +++- net/dsa/tag_sja1105.c | 12 +++++++++++- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index ea2e7f4f96d0..7687ddcae159 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1897,7 +1897,9 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds, return sja1105_static_config_reload(priv); } -/* Caller must hold priv->tagger_data.meta_lock */ +/* Must be called only with priv->tagger_data.state bit + * SJA1105_HWTS_RX_EN cleared + */ static int sja1105_change_rxtstamping(struct sja1105_private *priv, bool on) { @@ -1954,16 +1956,17 @@ static int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, break; } - if (rx_on != priv->tagger_data.hwts_rx_en) { - spin_lock(&priv->tagger_data.meta_lock); + if (rx_on != test_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state)) { + clear_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state); + rc = sja1105_change_rxtstamping(priv, rx_on); - spin_unlock(&priv->tagger_data.meta_lock); if (rc < 0) { dev_err(ds->dev, "Failed to change RX timestamping: %d\n", rc); - return -EFAULT; + return rc; } - priv->tagger_data.hwts_rx_en = rx_on; + if (rx_on) + set_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state); } if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) @@ -1982,7 +1985,7 @@ static int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, config.tx_type = HWTSTAMP_TX_ON; else config.tx_type = HWTSTAMP_TX_OFF; - if (priv->tagger_data.hwts_rx_en) + if (test_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state)) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; else config.rx_filter = HWTSTAMP_FILTER_NONE; @@ -2031,7 +2034,7 @@ static bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, struct sja1105_private *priv = ds->priv; struct sja1105_tagger_data *data = &priv->tagger_data; - if (!data->hwts_rx_en) + if (!test_bit(SJA1105_HWTS_RX_EN, &data->state)) return false; /* We need to read the full PTP clock to reconstruct the Rx diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 79435cfc20eb..897e799dbcb9 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -31,6 +31,8 @@ #define SJA1105_META_SMAC 0x222222222222ull #define SJA1105_META_DMAC 0x0180C200000Eull +#define SJA1105_HWTS_RX_EN 0 + /* Global tagger data: each struct sja1105_port has a reference to * the structure defined in struct sja1105_private. */ @@ -42,7 +44,7 @@ struct sja1105_tagger_data { * from taggers running on multiple ports on SMP systems */ spinlock_t meta_lock; - bool hwts_rx_en; + unsigned long state; }; struct sja1105_skb_cb { diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c9aff3e52cf..63ef2a14c934 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -156,7 +156,11 @@ static struct sk_buff /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ - if (is_link_local && sp->data->hwts_rx_en) { + if (is_link_local) { + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + /* Do normal processing. */ + return skb; + spin_lock(&sp->data->meta_lock); /* Was this a link-local frame instead of the meta * that we were expecting? @@ -187,6 +191,12 @@ static struct sk_buff } else if (is_meta) { struct sk_buff *stampable_skb; + /* Drop the meta frame if we're not in the right state + * to process it. + */ + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + return NULL; + spin_lock(&sp->data->meta_lock); stampable_skb = sp->data->stampable_skb; From db34a4714c013b644eec2de0ec81b1f0373b8b93 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 22:07:01 +0300 Subject: [PATCH 220/312] ptp_qoriq: Initialize the registers' spinlock before calling ptp_qoriq_settime Because ptp_qoriq_settime is being called prior to spin_lock_init, the following stack trace can be seen at driver probe time: [ 2.269117] the code is fine but needs lockdep annotation. [ 2.274569] turning off the locking correctness validator. [ 2.280027] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc7-01478-g01eaa67a4797 #263 [ 2.288073] Hardware name: Freescale LS1021A [ 2.292337] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.300045] [] (show_stack) from [] (dump_stack+0xcc/0xf8) [ 2.307235] [] (dump_stack) from [] (register_lock_class+0x730/0x73c) [ 2.315372] [] (register_lock_class) from [] (__lock_acquire+0x78/0x270c) [ 2.323856] [] (__lock_acquire) from [] (lock_acquire+0xe0/0x22c) [ 2.331649] [] (lock_acquire) from [] (_raw_spin_lock_irqsave+0x54/0x68) [ 2.340048] [] (_raw_spin_lock_irqsave) from [] (ptp_qoriq_settime+0x38/0x80) [ 2.348878] [] (ptp_qoriq_settime) from [] (ptp_qoriq_init+0x1f8/0x484) [ 2.357189] [] (ptp_qoriq_init) from [] (ptp_qoriq_probe+0xd0/0x184) [ 2.365243] [] (ptp_qoriq_probe) from [] (platform_drv_probe+0x48/0x9c) [ 2.373555] [] (platform_drv_probe) from [] (really_probe+0x1c4/0x400) [ 2.381779] [] (really_probe) from [] (driver_probe_device+0x78/0x1b8) [ 2.390003] [] (driver_probe_device) from [] (device_driver_attach+0x58/0x60) [ 2.398832] [] (device_driver_attach) from [] (__driver_attach+0xfc/0x160) [ 2.407402] [] (__driver_attach) from [] (bus_for_each_dev+0x68/0xb4) [ 2.415539] [] (bus_for_each_dev) from [] (bus_add_driver+0x104/0x20c) [ 2.423763] [] (bus_add_driver) from [] (driver_register+0x78/0x10c) [ 2.431815] [] (driver_register) from [] (do_one_initcall+0x8c/0x3ac) [ 2.439954] [] (do_one_initcall) from [] (kernel_init_freeable+0x468/0x548) [ 2.448610] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x10c) [ 2.456745] [] (kernel_init) from [] (ret_from_fork+0x14/0x20) [ 2.464273] Exception stack(0xea89ffb0 to 0xea89fff8) [ 2.469297] ffa0: 00000000 00000000 00000000 00000000 [ 2.477432] ffc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 2.485566] ffe0: 00000000 00000000 00000000 00000000 00000013 00000000 Fixes: ff54571a747b ("ptp_qoriq: convert to use ptp_qoriq_init/free") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/ptp/ptp_qoriq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index c61f00b72e15..a577218d1ab7 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -507,6 +507,8 @@ int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, ptp_qoriq->regs.etts_regs = base + ETTS_REGS_OFFSET; } + spin_lock_init(&ptp_qoriq->lock); + ktime_get_real_ts64(&now); ptp_qoriq_settime(&ptp_qoriq->caps, &now); @@ -514,7 +516,6 @@ int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, (ptp_qoriq->tclk_period & TCLK_PERIOD_MASK) << TCLK_PERIOD_SHIFT | (ptp_qoriq->cksel & CKSEL_MASK) << CKSEL_SHIFT; - spin_lock_init(&ptp_qoriq->lock); spin_lock_irqsave(&ptp_qoriq->lock, flags); regs = &ptp_qoriq->regs; From 0228ecf6128c92b47eadd2ac270c5574d9150c09 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 1 Oct 2019 21:56:37 +0000 Subject: [PATCH 221/312] MIPS: octeon: Include required header; fix octeon ethernet build Commit 171a9bae68c7 ("staging/octeon: Allow test build on !MIPS") moved the inclusion of a bunch of headers by various files in the Octeon ethernet driver into a common header, but in doing so it changed the order in which those headers are included. Prior to the referenced commit drivers/staging/octeon/ethernet.c included asm/octeon/cvmx-pip.h before asm/octeon/cvmx-ipd.h, which makes use of the CVMX_PIP_SFT_RST definition pulled in by the former. After commit 171a9bae68c7 ("staging/octeon: Allow test build on !MIPS") we pull in asm/octeon/cvmx-ipd.h first & builds fail with: In file included from drivers/staging/octeon/octeon-ethernet.h:27, from drivers/staging/octeon/ethernet.c:22: arch/mips/include/asm/octeon/cvmx-ipd.h: In function 'cvmx_ipd_free_ptr': arch/mips/include/asm/octeon/cvmx-ipd.h:330:27: error: storage size of 'pip_sft_rst' isn't known union cvmx_pip_sft_rst pip_sft_rst; ^~~~~~~~~~~ arch/mips/include/asm/octeon/cvmx-ipd.h:331:36: error: 'CVMX_PIP_SFT_RST' undeclared (first use in this function); did you mean 'CVMX_CIU_SOFT_RST'? pip_sft_rst.u64 = cvmx_read_csr(CVMX_PIP_SFT_RST); ^~~~~~~~~~~~~~~~ CVMX_CIU_SOFT_RST arch/mips/include/asm/octeon/cvmx-ipd.h:331:36: note: each undeclared identifier is reported only once for each function it appears in arch/mips/include/asm/octeon/cvmx-ipd.h:330:27: warning: unused variable 'pip_sft_rst' [-Wunused-variable] union cvmx_pip_sft_rst pip_sft_rst; ^~~~~~~~~~~ make[4]: *** [scripts/Makefile.build:266: drivers/staging/octeon/ethernet.o] Error 1 make[3]: *** [scripts/Makefile.build:509: drivers/staging/octeon] Error 2 Fix this by having asm/octeon/cvmx-ipd.h include the asm/octeon/cvmx-pip-defs.h header that it is reliant upon, rather than requiring its users to pull in that header before it. Signed-off-by: Paul Burton Fixes: 171a9bae68c7 ("staging/octeon: Allow test build on !MIPS") Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Matthew Wilcox (Oracle) Cc: linux-mips@vger.kernel.org Cc: David S . Miller Cc: Matthew Wilcox --- arch/mips/include/asm/octeon/cvmx-ipd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/include/asm/octeon/cvmx-ipd.h b/arch/mips/include/asm/octeon/cvmx-ipd.h index cbdc14b77435..adab7b54c3b4 100644 --- a/arch/mips/include/asm/octeon/cvmx-ipd.h +++ b/arch/mips/include/asm/octeon/cvmx-ipd.h @@ -36,6 +36,7 @@ #include #include +#include enum cvmx_ipd_mode { CVMX_IPD_OPC_MODE_STT = 0LL, /* All blocks DRAM, not cached in L2 */ From 69f08e68af25071483b91769957743de3b9deaef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Sep 2019 12:13:50 +0200 Subject: [PATCH 222/312] drm/amdgpu: revert "disable bulk moves for now" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit a213c2c7e235cfc0e0a161a558f7fdf2fb3a624a. The changes to fix this should have landed in 5.1. Signed-off-by: Christian König Reviewed-by: Zhou, David(ChunMing) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index e2fb141ff2e5..5251352f5922 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -603,14 +603,12 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, struct ttm_bo_global *glob = adev->mman.bdev.glob; struct amdgpu_vm_bo_base *bo_base; -#if 0 if (vm->bulk_moveable) { spin_lock(&glob->lru_lock); ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move); spin_unlock(&glob->lru_lock); return; } -#endif memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move)); From e0e4a2ce7a059d051c66cd7c94314fef3cd91aea Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 26 Sep 2019 16:16:41 +0800 Subject: [PATCH 223/312] drm/amd/powerplay: change metrics update period from 1ms to 100ms v2: change period from 10ms to 100ms (typo error) too high frequence to update mertrics table will cause smu firmware error,so change mertrics table update period from 1ms to 100ms (navi10, 12, 14) Signed-off-by: Kevin Wang Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.3.x --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 12c0e469bf35..2574f7186e16 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -547,7 +547,7 @@ static int navi10_get_metrics_table(struct smu_context *smu, struct smu_table_context *smu_table= &smu->smu_table; int ret = 0; - if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + HZ / 1000)) { + if (!smu_table->metrics_time || time_after(jiffies, smu_table->metrics_time + msecs_to_jiffies(100))) { ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0, (void *)smu_table->metrics_table, false); if (ret) { From 00921306751001b539ebee34171485cd9c749024 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 26 Sep 2019 16:22:13 +0800 Subject: [PATCH 224/312] drm/amd/powerplay: add sensor lock support for smu when multithreading access sysfs of amdgpu_pm_info at the sametime. the swsmu driver cause smu firmware hang. eg: single thread access: Message A + Param A ==> right Message B + Param B ==> right Message C + Param C ==> right multithreading access: Message A + Param B ==> error Message B + Param A ==> error Message C + Param C ==> right the patch will add sensor lock(mutex) to avoid this error. Signed-off-by: Kevin Wang Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.3.x --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 2 ++ drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 2 ++ drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 ++ drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 2 ++ 5 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 33960fb38a5d..4acf139ea014 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -843,6 +843,8 @@ static int smu_sw_init(void *handle) smu->smu_baco.state = SMU_BACO_STATE_EXIT; smu->smu_baco.platform_support = false; + mutex_init(&smu->sensor_lock); + smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index f1f072012fac..d493a3f8c07a 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -1018,6 +1018,7 @@ static int arcturus_read_sensor(struct smu_context *smu, if (!data || !size) return -EINVAL; + mutex_lock(&smu->sensor_lock); switch (sensor) { case AMDGPU_PP_SENSOR_MAX_FAN_RPM: *(uint32_t *)data = pptable->FanMaximumRpm; @@ -1044,6 +1045,7 @@ static int arcturus_read_sensor(struct smu_context *smu, default: ret = smu_smc_read_sensor(smu, sensor, data, size); } + mutex_unlock(&smu->sensor_lock); return ret; } diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index 6109815a0401..23171a4d9a31 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -344,6 +344,7 @@ struct smu_context const struct smu_funcs *funcs; const struct pptable_funcs *ppt_funcs; struct mutex mutex; + struct mutex sensor_lock; uint64_t pool_size; struct smu_table_context smu_table; diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 2574f7186e16..0b461404af6b 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -1386,6 +1386,7 @@ static int navi10_read_sensor(struct smu_context *smu, if(!data || !size) return -EINVAL; + mutex_lock(&smu->sensor_lock); switch (sensor) { case AMDGPU_PP_SENSOR_MAX_FAN_RPM: *(uint32_t *)data = pptable->FanMaximumRpm; @@ -1409,6 +1410,7 @@ static int navi10_read_sensor(struct smu_context *smu, default: ret = smu_smc_read_sensor(smu, sensor, data, size); } + mutex_unlock(&smu->sensor_lock); return ret; } diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c index 64386ee3f878..bbd8ebd58434 100644 --- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c @@ -3023,6 +3023,7 @@ static int vega20_read_sensor(struct smu_context *smu, if(!data || !size) return -EINVAL; + mutex_lock(&smu->sensor_lock); switch (sensor) { case AMDGPU_PP_SENSOR_MAX_FAN_RPM: *(uint32_t *)data = pptable->FanMaximumRpm; @@ -3048,6 +3049,7 @@ static int vega20_read_sensor(struct smu_context *smu, default: ret = smu_smc_read_sensor(smu, sensor, data, size); } + mutex_unlock(&smu->sensor_lock); return ret; } From 8225630ea61da1c96af84849a055d5b20b404438 Mon Sep 17 00:00:00 2001 From: Aaron Liu Date: Mon, 16 Sep 2019 10:05:09 +0800 Subject: [PATCH 225/312] Revert "drm/amdgpu: disable stutter mode for renoir" This reverts commit 5813f97a5969bf1e7e723397a74e00b5de7278d6. Since SBIOS WCD9925N, NMI printing disappeared. Hence enable stutter mode. Signed-off-by: Aaron Liu Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 8cab6da512a0..a97eea304cc0 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2385,8 +2385,6 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) if (adev->asic_type != CHIP_CARRIZO && adev->asic_type != CHIP_STONEY) dm->dc->debug.disable_stutter = amdgpu_pp_feature_mask & PP_STUTTER_MODE ? false : true; - if (adev->asic_type == CHIP_RENOIR) - dm->dc->debug.disable_stutter = true; return 0; fail: From 49379032aa22a040fdd7708fcb6300d1d98a0f57 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 1 Oct 2019 16:45:27 -0500 Subject: [PATCH 226/312] drm/amdgpu: don't increment vram lost if we are in hibernation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We reset the GPU as part of our hibernation sequence so we need to make sure we don't mark vram as lost in that case. Bug: https://bugs.freedesktop.org/show_bug.cgi?id=111879 Acked-by: Christian König Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/nv.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 85393a99a848..de9b995b65b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -317,10 +317,12 @@ static int nv_asic_reset(struct amdgpu_device *adev) struct smu_context *smu = &adev->smu; if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { - amdgpu_inc_vram_lost(adev); + if (!adev->in_suspend) + amdgpu_inc_vram_lost(adev); ret = smu_baco_reset(smu); } else { - amdgpu_inc_vram_lost(adev); + if (!adev->in_suspend) + amdgpu_inc_vram_lost(adev); ret = nv_asic_mode1_reset(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index f70658a536a9..61df616b5d45 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -558,12 +558,14 @@ static int soc15_asic_reset(struct amdgpu_device *adev) { switch (soc15_asic_reset_method(adev)) { case AMD_RESET_METHOD_BACO: - amdgpu_inc_vram_lost(adev); + if (!adev->in_suspend) + amdgpu_inc_vram_lost(adev); return soc15_asic_baco_reset(adev); case AMD_RESET_METHOD_MODE2: return soc15_mode2_reset(adev); default: - amdgpu_inc_vram_lost(adev); + if (!adev->in_suspend) + amdgpu_inc_vram_lost(adev); return soc15_asic_mode1_reset(adev); } } From 815fb4c9d7da862a47c9d2a9765a4759826c5783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 24 Sep 2019 17:53:25 -0400 Subject: [PATCH 227/312] drm/amdgpu: return tcc_disabled_mask to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UMDs need this for correct programming of harvested chips. Signed-off-by: Marek Olšák Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 ++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 12 ++++++++++++ include/uapi/drm/amdgpu_drm.h | 2 ++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 264677ab248a..6f8aaf655a9f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -81,9 +81,10 @@ * - 3.32.0 - Add syncobj timeline support to AMDGPU_CS. * - 3.33.0 - Fixes for GDS ENOMEM failures in AMDGPU_CS. * - 3.34.0 - Non-DC can flip correctly between buffers with different pitches + * - 3.35.0 - Add drm_amdgpu_info_device::tcc_disabled_mask */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 34 +#define KMS_DRIVER_MINOR 35 #define KMS_DRIVER_PATCHLEVEL 0 #define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 554a59b3c4a6..6ee4021910e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -165,6 +165,7 @@ struct amdgpu_gfx_config { uint32_t num_sc_per_sh; uint32_t num_packer_per_sc; uint32_t pa_sc_tile_steering_override; + uint64_t tcc_disabled_mask; }; struct amdgpu_cu_info { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index f6147528be64..f2c097983f48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -787,6 +787,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file dev_info.pa_sc_tile_steering_override = adev->gfx.config.pa_sc_tile_steering_override; + dev_info.tcc_disabled_mask = adev->gfx.config.tcc_disabled_mask; + return copy_to_user(out, &dev_info, min((size_t)size, sizeof(dev_info))) ? -EFAULT : 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 638c821611ab..957811b73672 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -1691,6 +1691,17 @@ static void gfx_v10_0_tcp_harvest(struct amdgpu_device *adev) } } +static void gfx_v10_0_get_tcc_info(struct amdgpu_device *adev) +{ + /* TCCs are global (not instanced). */ + uint32_t tcc_disable = RREG32_SOC15(GC, 0, mmCGTS_TCC_DISABLE) | + RREG32_SOC15(GC, 0, mmCGTS_USER_TCC_DISABLE); + + adev->gfx.config.tcc_disabled_mask = + REG_GET_FIELD(tcc_disable, CGTS_TCC_DISABLE, TCC_DISABLE) | + (REG_GET_FIELD(tcc_disable, CGTS_TCC_DISABLE, HI_TCC_DISABLE) << 16); +} + static void gfx_v10_0_constants_init(struct amdgpu_device *adev) { u32 tmp; @@ -1702,6 +1713,7 @@ static void gfx_v10_0_constants_init(struct amdgpu_device *adev) gfx_v10_0_setup_rb(adev); gfx_v10_0_get_cu_info(adev, &adev->gfx.cu_info); + gfx_v10_0_get_tcc_info(adev); adev->gfx.config.pa_sc_tile_steering_override = gfx_v10_0_init_pa_sc_tile_steering_override(adev); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index c99b4f2482c6..4fe35d600ab8 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -1003,6 +1003,8 @@ struct drm_amdgpu_info_device { __u64 high_va_max; /* gfx10 pa_sc_tile_steering_override */ __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; }; struct drm_amdgpu_info_hw_ip { From 57be09c6e8747bf48704136d9e3f92bfb93f5725 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Tue, 1 Oct 2019 22:46:07 -0500 Subject: [PATCH 228/312] drm/amdgpu: fix multiple memory leaks in acp_hw_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In acp_hw_init there are some allocations that needs to be released in case of failure: 1- adev->acp.acp_genpd should be released if any allocation attemp for adev->acp.acp_cell, adev->acp.acp_res or i2s_pdata fails. 2- all of those allocations should be released if mfd_add_hotplug_devices or pm_genpd_add_device fail. 3- Release is needed in case of time out values expire. Reviewed-by: Christian König Signed-off-by: Navid Emamdoost Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c | 34 ++++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c index eba42c752bca..82155ac3288a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c @@ -189,7 +189,7 @@ static int acp_hw_init(void *handle) u32 val = 0; u32 count = 0; struct device *dev; - struct i2s_platform_data *i2s_pdata; + struct i2s_platform_data *i2s_pdata = NULL; struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -231,20 +231,21 @@ static int acp_hw_init(void *handle) adev->acp.acp_cell = kcalloc(ACP_DEVS, sizeof(struct mfd_cell), GFP_KERNEL); - if (adev->acp.acp_cell == NULL) - return -ENOMEM; + if (adev->acp.acp_cell == NULL) { + r = -ENOMEM; + goto failure; + } adev->acp.acp_res = kcalloc(5, sizeof(struct resource), GFP_KERNEL); if (adev->acp.acp_res == NULL) { - kfree(adev->acp.acp_cell); - return -ENOMEM; + r = -ENOMEM; + goto failure; } i2s_pdata = kcalloc(3, sizeof(struct i2s_platform_data), GFP_KERNEL); if (i2s_pdata == NULL) { - kfree(adev->acp.acp_res); - kfree(adev->acp.acp_cell); - return -ENOMEM; + r = -ENOMEM; + goto failure; } switch (adev->asic_type) { @@ -341,14 +342,14 @@ static int acp_hw_init(void *handle) r = mfd_add_hotplug_devices(adev->acp.parent, adev->acp.acp_cell, ACP_DEVS); if (r) - return r; + goto failure; for (i = 0; i < ACP_DEVS ; i++) { dev = get_mfd_cell_dev(adev->acp.acp_cell[i].name, i); r = pm_genpd_add_device(&adev->acp.acp_genpd->gpd, dev); if (r) { dev_err(dev, "Failed to add dev to genpd\n"); - return r; + goto failure; } } @@ -367,7 +368,8 @@ static int acp_hw_init(void *handle) break; if (--count == 0) { dev_err(&adev->pdev->dev, "Failed to reset ACP\n"); - return -ETIMEDOUT; + r = -ETIMEDOUT; + goto failure; } udelay(100); } @@ -384,7 +386,8 @@ static int acp_hw_init(void *handle) break; if (--count == 0) { dev_err(&adev->pdev->dev, "Failed to reset ACP\n"); - return -ETIMEDOUT; + r = -ETIMEDOUT; + goto failure; } udelay(100); } @@ -393,6 +396,13 @@ static int acp_hw_init(void *handle) val &= ~ACP_SOFT_RESET__SoftResetAud_MASK; cgs_write_register(adev->acp.cgs_device, mmACP_SOFT_RESET, val); return 0; + +failure: + kfree(i2s_pdata); + kfree(adev->acp.acp_res); + kfree(adev->acp.acp_cell); + kfree(adev->acp.acp_genpd); + return r; } /** From 055e547478a11a6360c7ce05e2afc3e366968a12 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Mon, 16 Sep 2019 22:20:44 -0500 Subject: [PATCH 229/312] drm/amd/display: memory leak In dcn*_clock_source_create when dcn20_clk_src_construct fails allocated clk_src needs release. Signed-off-by: Navid Emamdoost Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c | 1 + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c | 1 + 7 files changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c b/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c index 1787b9bf800a..76d54885374a 100644 --- a/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c @@ -668,6 +668,7 @@ struct clock_source *dce100_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c index 318e9c2e2ca8..89620adc81d8 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c @@ -714,6 +714,7 @@ struct clock_source *dce110_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c index 83e1878161c9..21a657e79306 100644 --- a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c @@ -687,6 +687,7 @@ struct clock_source *dce112_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c index 8b85e5274bba..7c52f7f9196c 100644 --- a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c @@ -500,6 +500,7 @@ static struct clock_source *dce120_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c b/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c index 4625df9f9fd2..643ccb0ade00 100644 --- a/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c @@ -701,6 +701,7 @@ struct clock_source *dce80_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c index 59305e411a66..1599bb971111 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c @@ -786,6 +786,7 @@ struct clock_source *dcn10_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c index b4e3ce22ed52..5a2763daff4d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c @@ -1077,6 +1077,7 @@ struct clock_source *dcn20_clock_source_create( return &clk_src->base; } + kfree(clk_src); BREAK_TO_DEBUGGER(); return NULL; } From ec3e5c0f0c2b716e768c0eee0fec30d572939ef5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Oct 2019 14:01:22 +0200 Subject: [PATCH 230/312] drm/amdgpu: make pmu support optional, again When CONFIG_PERF_EVENTS is disabled, we cannot compile the pmu portion of the amdgpu driver: drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c:48:38: error: no member named 'hw' in 'struct perf_event' struct hw_perf_event *hwc = &event->hw; ~~~~~ ^ drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c:51:13: error: no member named 'attr' in 'struct perf_event' if (event->attr.type != event->pmu->type) ~~~~~ ^ ... The same bug was already fixed by commit d155bef0636e ("amdgpu: make pmu support optional") but broken again by what looks like an incorrectly rebased patch. Fixes: 64f55e629237 ("drm/amdgpu: Add RAS EEPROM table.") Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 42e2c1f57152..00962a659009 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \ amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \ amdgpu_vf_error.o amdgpu_sched.o amdgpu_debugfs.o amdgpu_ids.o \ amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \ - amdgpu_vm_sdma.o amdgpu_pmu.o amdgpu_discovery.o amdgpu_ras_eeprom.o smu_v11_0_i2c.o + amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o smu_v11_0_i2c.o amdgpu-$(CONFIG_PERF_EVENTS) += amdgpu_pmu.o From 29174a4310bf9b25b4961c30b38be4151732d1c7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Oct 2019 14:01:23 +0200 Subject: [PATCH 231/312] drm/amdgpu: hide another #warning An earlier patch of mine disabled some #warning statements that get in the way of build testing, but then another instance was added around the same time. Remove that as well. Fixes: b5203d16aef4 ("drm/amd/amdgpu: hide #warning for missing DC config") Fixes: e1c14c43395c ("drm/amdgpu: Enable DC on Renoir") Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/soc15.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 61df616b5d45..f8ab80c8801b 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -773,8 +773,6 @@ int soc15_set_ip_blocks(struct amdgpu_device *adev) #if defined(CONFIG_DRM_AMD_DC) else if (amdgpu_device_has_dc_support(adev)) amdgpu_device_ip_block_add(adev, &dm_ip_block); -#else -# warning "Enable CONFIG_DRM_AMD_DC for display support on SOC15." #endif amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); break; From beda921dbc99ce240ae88e2c9740c99c354c7ca2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Oct 2019 14:01:24 +0200 Subject: [PATCH 232/312] drm/amdgpu: display_mode_vba_21: remove uint typedef The type definition for 'uint' clashes with the generic kernel headers: drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn21/display_mode_vba_21.c:43:22: error: redefinition of typedef 'uint' is a C11 feature [-Werror,-Wtypedef-redefinition] include/linux/types.h:92:23: note: previous definition is here Just remove this type and use plain 'unsigned int' consistently, as it is already use almost everywhere in this file. Fixes: b04641a3f4c5 ("drm/amd/display: Add Renoir DML") Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- .../amd/display/dc/dml/dcn21/display_mode_vba_21.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_mode_vba_21.c b/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_mode_vba_21.c index 456cd0e3289c..3b6ed60dcd35 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_mode_vba_21.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_mode_vba_21.c @@ -39,9 +39,6 @@ * ways. Unless there is something clearly wrong with it the code should * remain as-is as it provides us with a guarantee from HW that it is correct. */ - -typedef unsigned int uint; - typedef struct { double DPPCLK; double DISPCLK; @@ -4774,7 +4771,7 @@ void dml21_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l mode_lib->vba.MaximumReadBandwidthWithoutPrefetch = 0.0; mode_lib->vba.MaximumReadBandwidthWithPrefetch = 0.0; for (k = 0; k <= mode_lib->vba.NumberOfActivePlanes - 1; k++) { - uint m; + unsigned int m; locals->cursor_bw[k] = 0; locals->cursor_bw_pre[k] = 0; @@ -5285,7 +5282,7 @@ static void CalculateWatermarksAndDRAMSpeedChangeSupport( double SecondMinActiveDRAMClockChangeMarginOneDisplayInVBLank; double FullDETBufferingTimeYStutterCriticalPlane = 0; double TimeToFinishSwathTransferStutterCriticalPlane = 0; - uint k, j; + unsigned int k, j; mode_lib->vba.TotalActiveDPP = 0; mode_lib->vba.TotalDCCActiveDPP = 0; @@ -5507,7 +5504,7 @@ static void CalculateDCFCLKDeepSleep( double DPPCLK[], double *DCFCLKDeepSleep) { - uint k; + unsigned int k; double DisplayPipeLineDeliveryTimeLuma; double DisplayPipeLineDeliveryTimeChroma; //double DCFCLKDeepSleepPerPlane[DC__NUM_DPP__MAX]; @@ -5727,7 +5724,7 @@ static void CalculatePixelDeliveryTimes( double DisplayPipeRequestDeliveryTimeChromaPrefetch[]) { double req_per_swath_ub; - uint k; + unsigned int k; for (k = 0; k < NumberOfActivePlanes; ++k) { if (VRatio[k] <= 1) { @@ -5869,7 +5866,7 @@ static void CalculateMetaAndPTETimes( unsigned int dpte_groups_per_row_chroma_ub; unsigned int num_group_per_lower_vm_stage; unsigned int num_req_per_lower_vm_stage; - uint k; + unsigned int k; for (k = 0; k < NumberOfActivePlanes; ++k) { if (GPUVMEnable == true) { From 148d31e38fb37ba442f9b554325114b9e80cf336 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 21:53:58 +0200 Subject: [PATCH 233/312] drm/amd/display: hide an unused variable Without CONFIG_DEBUG_FS, we get a warning for an unused variable: drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:6020:33: error: unused variable 'source' [-Werror,-Wunused-variable] Hide the variable in an #ifdef like its only users. Fixes: 14b2584636c6 ("drm/amd/display: add functionality to grab DPRX CRC entries.") Reviewed-by: Harry Wentland Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index a97eea304cc0..a52f0b13a2c8 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6017,7 +6017,9 @@ static void amdgpu_dm_enable_crtc_interrupts(struct drm_device *dev, struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state, *new_crtc_state; int i; +#ifdef CONFIG_DEBUG_FS enum amdgpu_dm_pipe_crc_source source; +#endif for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { From c0e70e10b11b1b7ca0f537e7ce4404a0459cdc7c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Oct 2019 14:01:25 +0200 Subject: [PATCH 234/312] drm/amd/display: fix dcn21 Makefile for clang Just like all the other variants, this one passes invalid compile-time options with clang after the new code got merged: clang: error: unknown argument: '-mpreferred-stack-boundary=4' scripts/Makefile.build:265: recipe for target 'drivers/gpu/drm/amd/amdgpu/../display/dc/dcn21/dcn21_resource.o' failed Use the same variant that we have for dcn20 to fix compilation. Fixes: eced51f9babb ("drm/amd/display: Add hubp block for Renoir (v2)") Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn21/Makefile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/Makefile b/drivers/gpu/drm/amd/display/dc/dcn21/Makefile index 8cd9de8b1a7a..ef673bffc241 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn21/Makefile @@ -3,7 +3,17 @@ DCN21 = dcn21_hubp.o dcn21_hubbub.o dcn21_resource.o -CFLAGS_$(AMDDALPATH)/dc/dcn21/dcn21_resource.o := -mhard-float -msse -mpreferred-stack-boundary=4 +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align := -mpreferred-stack-boundary=4 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align := -mstack-alignment=16 +endif + +CFLAGS_$(AMDDALPATH)/dc/dcn21/dcn21_resource.o := -mhard-float -msse $(cc_stack_align) + +ifdef CONFIG_CC_IS_CLANG +CFLAGS_$(AMDDALPATH)/dc/dcn21/dcn21_resource.o += -msse2 +endif AMD_DAL_DCN21 = $(addprefix $(AMDDALPATH)/dc/dcn21/,$(DCN21)) From e2973352fa1390c9a40d8a23c8c143dadd0c028c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 2 Oct 2019 16:42:04 +0200 Subject: [PATCH 235/312] dt-bindings: dsp: Fix fsl,dsp example The fsl,dsp binding requires a memory-region, yet its example doesn't have one which results in a warning. Let's add a memory-region phandle to the example. Fixes: 7db2f2dfc701 ("dt-bindings: dsp: fsl: Add DSP core binding support") Signed-off-by: Maxime Ripard Reviewed-by: Daniel Baluta Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/dsp/fsl,dsp.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml b/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml index 3248595dc93c..f04870d84542 100644 --- a/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml +++ b/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml @@ -85,4 +85,5 @@ examples: <&pd IMX_SC_R_DSP_RAM>; mbox-names = "txdb0", "txdb1", "rxdb0", "rxdb1"; mboxes = <&lsio_mu13 2 0>, <&lsio_mu13 2 1>, <&lsio_mu13 3 0>, <&lsio_mu13 3 1>; + memory-region = <&dsp_reserved>; }; From 58c50fe0c9f02f4065349b7ddefbb9899c589396 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 2 Oct 2019 16:44:00 +0200 Subject: [PATCH 236/312] dt-bindings: media: rc: Fix redundant string The linux,rc-map-name property is described using an enum, yet a value has been put in that enum twice, resulting in a warning. Let's fix that. Fixes: 7c31b9d67342 ("media: dt-bindings: media: Add YAML schemas for the generic RC bindings") Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/media/rc.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/media/rc.yaml b/Documentation/devicetree/bindings/media/rc.yaml index 3d5c154fd230..9054555e6608 100644 --- a/Documentation/devicetree/bindings/media/rc.yaml +++ b/Documentation/devicetree/bindings/media/rc.yaml @@ -73,7 +73,6 @@ properties: - rc-genius-tvgo-a11mce - rc-gotview7135 - rc-hauppauge - - rc-hauppauge - rc-hisi-poplar - rc-hisi-tv-demo - rc-imon-mce From 932bae3a5b4e9b9a40133e4dc989c2099f3d1823 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 2 Oct 2019 16:45:41 +0200 Subject: [PATCH 237/312] dt-bindings: iio: ad7192: Fix Regulator Properties The AD7192 binding describes two regulator properties, avdd-supply and dvdd-supply, but describes it as a constant string that must be avdd and dvdd. This is wrong since a *-supply property is actually a phandle, and results in warnings when the example is validated (or any device tree using that device, for that matter). Let's remove that requirement. Fixes: f7356e47032c ("dt-bindings: iio: adc: ad7192: Add binding documentation for AD7192") Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml index 676ec42e1438..9e62f54c891a 100644 --- a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml +++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml @@ -43,13 +43,9 @@ properties: dvdd-supply: description: DVdd voltage supply - items: - - const: dvdd avdd-supply: description: AVdd voltage supply - items: - - const: avdd adi,rejection-60-Hz-enable: description: | From 4d32db74a3cafa9c96608265db6c40317040df41 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 2 Oct 2019 16:45:42 +0200 Subject: [PATCH 238/312] dt-bindings: iio: ad7192: Fix DTC warning in the example The example contains an SPI bus and device, but doesn't have the appropriate size and address cells size. This creates a DTC warning when the example is compiled since the default ones will not match what the device uses. Let's add them to remove that warning. Fixes: f7356e47032c ("dt-bindings: iio: adc: ad7192: Add binding documentation for AD7192") Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml index 9e62f54c891a..567a33a83dce 100644 --- a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml +++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml @@ -95,6 +95,9 @@ required: examples: - | spi0 { + #address-cells = <1>; + #size-cells = <0>; + adc@0 { compatible = "adi,ad7192"; reg = <0>; From f437ade3296bacaddb6d7882ba0515940f01daf4 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 2 Oct 2019 16:46:40 +0200 Subject: [PATCH 239/312] dt-bindings: phy: lantiq: Fix Property Name The binding has a typo where resets-names should read reset-names, which in turn leads to a warning when the example is validated, since reset-names is being used, and the binding prevent the usage of any property that isn't described. Fixes: 088e88be5a38 ("dt-bindings: phy: add binding for the Lantiq VRX200 and ARX300 PCIe PHYs") Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- .../devicetree/bindings/phy/lantiq,vrx200-pcie-phy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/phy/lantiq,vrx200-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/lantiq,vrx200-pcie-phy.yaml index 8a56a8526cef..a97482179cf5 100644 --- a/Documentation/devicetree/bindings/phy/lantiq,vrx200-pcie-phy.yaml +++ b/Documentation/devicetree/bindings/phy/lantiq,vrx200-pcie-phy.yaml @@ -37,7 +37,7 @@ properties: - description: exclusive PHY reset line - description: shared reset line between the PCIe PHY and PCIe controller - resets-names: + reset-names: items: - const: phy - const: pcie From a8fabb38525c51a094607768bac3ba46b3f4a9d5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 1 Oct 2019 17:03:55 +0200 Subject: [PATCH 240/312] xen/xenbus: fix self-deadlock after killing user process In case a user process using xenbus has open transactions and is killed e.g. via ctrl-C the following cleanup of the allocated resources might result in a deadlock due to trying to end a transaction in the xenbus worker thread: [ 2551.474706] INFO: task xenbus:37 blocked for more than 120 seconds. [ 2551.492215] Tainted: P OE 5.0.0-29-generic #5 [ 2551.510263] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 2551.528585] xenbus D 0 37 2 0x80000080 [ 2551.528590] Call Trace: [ 2551.528603] __schedule+0x2c0/0x870 [ 2551.528606] ? _cond_resched+0x19/0x40 [ 2551.528632] schedule+0x2c/0x70 [ 2551.528637] xs_talkv+0x1ec/0x2b0 [ 2551.528642] ? wait_woken+0x80/0x80 [ 2551.528645] xs_single+0x53/0x80 [ 2551.528648] xenbus_transaction_end+0x3b/0x70 [ 2551.528651] xenbus_file_free+0x5a/0x160 [ 2551.528654] xenbus_dev_queue_reply+0xc4/0x220 [ 2551.528657] xenbus_thread+0x7de/0x880 [ 2551.528660] ? wait_woken+0x80/0x80 [ 2551.528665] kthread+0x121/0x140 [ 2551.528667] ? xb_read+0x1d0/0x1d0 [ 2551.528670] ? kthread_park+0x90/0x90 [ 2551.528673] ret_from_fork+0x35/0x40 Fix this by doing the cleanup via a workqueue instead. Reported-by: James Dingwall Fixes: fd8aa9095a95c ("xen: optimize xenbus driver for multiple concurrent xenstore accesses") Cc: # 4.11 Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus_dev_frontend.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 08adc590f631..597af455a522 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -116,6 +117,8 @@ struct xenbus_file_priv { wait_queue_head_t read_waitq; struct kref kref; + + struct work_struct wq; }; /* Read out any raw xenbus messages queued up. */ @@ -300,14 +303,14 @@ static void watch_fired(struct xenbus_watch *watch, mutex_unlock(&adap->dev_data->reply_mutex); } -static void xenbus_file_free(struct kref *kref) +static void xenbus_worker(struct work_struct *wq) { struct xenbus_file_priv *u; struct xenbus_transaction_holder *trans, *tmp; struct watch_adapter *watch, *tmp_watch; struct read_buffer *rb, *tmp_rb; - u = container_of(kref, struct xenbus_file_priv, kref); + u = container_of(wq, struct xenbus_file_priv, wq); /* * No need for locking here because there are no other users, @@ -333,6 +336,18 @@ static void xenbus_file_free(struct kref *kref) kfree(u); } +static void xenbus_file_free(struct kref *kref) +{ + struct xenbus_file_priv *u; + + /* + * We might be called in xenbus_thread(). + * Use workqueue to avoid deadlock. + */ + u = container_of(kref, struct xenbus_file_priv, kref); + schedule_work(&u->wq); +} + static struct xenbus_transaction_holder *xenbus_get_transaction( struct xenbus_file_priv *u, uint32_t tx_id) { @@ -650,6 +665,7 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) INIT_LIST_HEAD(&u->watches); INIT_LIST_HEAD(&u->read_buffers); init_waitqueue_head(&u->read_waitq); + INIT_WORK(&u->wq, xenbus_worker); mutex_init(&u->reply_mutex); mutex_init(&u->msgbuffer_mutex); From 3fd57e7a9e66b9a8bcbf0560ff09e84d0b8de1bd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Oct 2019 19:50:23 +0200 Subject: [PATCH 241/312] char/random: Add a newline at the end of the file On Tue, Oct 01, 2019 at 10:14:40AM -0700, Linus Torvalds wrote: > The previous state of the file didn't have that 0xa at the end, so you get that > > > -EXPORT_SYMBOL_GPL(add_bootloader_randomness); > \ No newline at end of file > +EXPORT_SYMBOL_GPL(add_bootloader_randomness); > > which is "the '-' line doesn't have a newline, the '+' line does" marker. Aaha, that makes total sense, thanks for explaining. Oh well, let's fix it then so that people don't scratch heads like me. Signed-off-by: Linus Torvalds --- drivers/char/random.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c2f7de9dc543..de434feb873a 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2520,4 +2520,4 @@ void add_bootloader_randomness(const void *buf, unsigned int size) else add_device_randomness(buf, size); } -EXPORT_SYMBOL_GPL(add_bootloader_randomness); \ No newline at end of file +EXPORT_SYMBOL_GPL(add_bootloader_randomness); From 0671c5b84e9e0a6d42d22da9b5d093787ac1c5f3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 2 Oct 2019 18:59:49 +0000 Subject: [PATCH 242/312] MIPS: Wire up clone3 syscall Wire up the new clone3 syscall for MIPS, using save_static_function() to generate a wrapper that saves registers $s0-$s7 prior to invoking the generic sys_clone3 function just like we do for plain old clone. Tested atop 64r6el_defconfig using o32, n32 & n64 builds of the simple test program from: https://lore.kernel.org/lkml/20190716130631.tohj4ub54md25dys@brauner.io/ Signed-off-by: Paul Burton Cc: Christian Brauner Acked-by: Christian Brauner Cc: linux-mips@vger.kernel.org --- arch/mips/include/asm/unistd.h | 1 + arch/mips/kernel/syscall.c | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n64.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- 5 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 071053ece677..5d70babfc9ee 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -52,6 +52,7 @@ # endif #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 /* whitelists for checksyscalls */ #define __IGNORE_fadvise64_64 diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index b0e25e913bdb..3f16f3823031 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -80,6 +80,7 @@ SYSCALL_DEFINE6(mips_mmap2, unsigned long, addr, unsigned long, len, save_static_function(sys_fork); save_static_function(sys_clone); +save_static_function(sys_clone3); SYSCALL_DEFINE1(set_thread_area, unsigned long, addr) { diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c9c879ec9b6d..e7c5ab38e403 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -373,4 +373,4 @@ 432 n32 fsmount sys_fsmount 433 n32 fspick sys_fspick 434 n32 pidfd_open sys_pidfd_open -# 435 reserved for clone3 +435 n32 clone3 __sys_clone3 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index bbce9159caa1..13cd66581f3b 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -349,4 +349,4 @@ 432 n64 fsmount sys_fsmount 433 n64 fspick sys_fspick 434 n64 pidfd_open sys_pidfd_open -# 435 reserved for clone3 +435 n64 clone3 __sys_clone3 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 9653591428ec..353539ea4140 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -422,4 +422,4 @@ 432 o32 fsmount sys_fsmount 433 o32 fspick sys_fspick 434 o32 pidfd_open sys_pidfd_open -# 435 reserved for clone3 +435 o32 clone3 __sys_clone3 From 90800281e761247d8096ab213c10d638602655f9 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 2 Oct 2019 17:46:17 +0000 Subject: [PATCH 243/312] MIPS: VDSO: Remove unused gettimeofday.c arch/mips/vdso/gettimeofday.c has been unused since commit 24640f233b46 ("mips: Add support for generic vDSO"). Remove the dead code. Signed-off-by: Paul Burton Fixes: 24640f233b46 ("mips: Add support for generic vDSO") Cc: Vincenzo Frascino Cc: linux-mips@vger.kernel.org --- arch/mips/vdso/gettimeofday.c | 269 ---------------------------------- 1 file changed, 269 deletions(-) delete mode 100644 arch/mips/vdso/gettimeofday.c diff --git a/arch/mips/vdso/gettimeofday.c b/arch/mips/vdso/gettimeofday.c deleted file mode 100644 index e8243c7fd5b5..000000000000 --- a/arch/mips/vdso/gettimeofday.c +++ /dev/null @@ -1,269 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2015 Imagination Technologies - * Author: Alex Smith - */ - -#include "vdso.h" - -#include -#include - -#include -#include -#include -#include - -#ifdef CONFIG_MIPS_CLOCK_VSYSCALL - -static __always_inline long gettimeofday_fallback(struct timeval *_tv, - struct timezone *_tz) -{ - register struct timezone *tz asm("a1") = _tz; - register struct timeval *tv asm("a0") = _tv; - register long ret asm("v0"); - register long nr asm("v0") = __NR_gettimeofday; - register long error asm("a3"); - - asm volatile( - " syscall\n" - : "=r" (ret), "=r" (error) - : "r" (tv), "r" (tz), "r" (nr) - : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); - - return error ? -ret : ret; -} - -#endif - -static __always_inline long clock_gettime_fallback(clockid_t _clkid, - struct timespec *_ts) -{ - register struct timespec *ts asm("a1") = _ts; - register clockid_t clkid asm("a0") = _clkid; - register long ret asm("v0"); - register long nr asm("v0") = __NR_clock_gettime; - register long error asm("a3"); - - asm volatile( - " syscall\n" - : "=r" (ret), "=r" (error) - : "r" (clkid), "r" (ts), "r" (nr) - : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); - - return error ? -ret : ret; -} - -static __always_inline int do_realtime_coarse(struct timespec *ts, - const union mips_vdso_data *data) -{ - u32 start_seq; - - do { - start_seq = vdso_data_read_begin(data); - - ts->tv_sec = data->xtime_sec; - ts->tv_nsec = data->xtime_nsec >> data->cs_shift; - } while (vdso_data_read_retry(data, start_seq)); - - return 0; -} - -static __always_inline int do_monotonic_coarse(struct timespec *ts, - const union mips_vdso_data *data) -{ - u32 start_seq; - u64 to_mono_sec; - u64 to_mono_nsec; - - do { - start_seq = vdso_data_read_begin(data); - - ts->tv_sec = data->xtime_sec; - ts->tv_nsec = data->xtime_nsec >> data->cs_shift; - - to_mono_sec = data->wall_to_mono_sec; - to_mono_nsec = data->wall_to_mono_nsec; - } while (vdso_data_read_retry(data, start_seq)); - - ts->tv_sec += to_mono_sec; - timespec_add_ns(ts, to_mono_nsec); - - return 0; -} - -#ifdef CONFIG_CSRC_R4K - -static __always_inline u64 read_r4k_count(void) -{ - unsigned int count; - - __asm__ __volatile__( - " .set push\n" - " .set mips32r2\n" - " rdhwr %0, $2\n" - " .set pop\n" - : "=r" (count)); - - return count; -} - -#endif - -#ifdef CONFIG_CLKSRC_MIPS_GIC - -static __always_inline u64 read_gic_count(const union mips_vdso_data *data) -{ - void __iomem *gic = get_gic(data); - u32 hi, hi2, lo; - - do { - hi = __raw_readl(gic + sizeof(lo)); - lo = __raw_readl(gic); - hi2 = __raw_readl(gic + sizeof(lo)); - } while (hi2 != hi); - - return (((u64)hi) << 32) + lo; -} - -#endif - -static __always_inline u64 get_ns(const union mips_vdso_data *data) -{ - u64 cycle_now, delta, nsec; - - switch (data->clock_mode) { -#ifdef CONFIG_CSRC_R4K - case VDSO_CLOCK_R4K: - cycle_now = read_r4k_count(); - break; -#endif -#ifdef CONFIG_CLKSRC_MIPS_GIC - case VDSO_CLOCK_GIC: - cycle_now = read_gic_count(data); - break; -#endif - default: - return 0; - } - - delta = (cycle_now - data->cs_cycle_last) & data->cs_mask; - - nsec = (delta * data->cs_mult) + data->xtime_nsec; - nsec >>= data->cs_shift; - - return nsec; -} - -static __always_inline int do_realtime(struct timespec *ts, - const union mips_vdso_data *data) -{ - u32 start_seq; - u64 ns; - - do { - start_seq = vdso_data_read_begin(data); - - if (data->clock_mode == VDSO_CLOCK_NONE) - return -ENOSYS; - - ts->tv_sec = data->xtime_sec; - ns = get_ns(data); - } while (vdso_data_read_retry(data, start_seq)); - - ts->tv_nsec = 0; - timespec_add_ns(ts, ns); - - return 0; -} - -static __always_inline int do_monotonic(struct timespec *ts, - const union mips_vdso_data *data) -{ - u32 start_seq; - u64 ns; - u64 to_mono_sec; - u64 to_mono_nsec; - - do { - start_seq = vdso_data_read_begin(data); - - if (data->clock_mode == VDSO_CLOCK_NONE) - return -ENOSYS; - - ts->tv_sec = data->xtime_sec; - ns = get_ns(data); - - to_mono_sec = data->wall_to_mono_sec; - to_mono_nsec = data->wall_to_mono_nsec; - } while (vdso_data_read_retry(data, start_seq)); - - ts->tv_sec += to_mono_sec; - ts->tv_nsec = 0; - timespec_add_ns(ts, ns + to_mono_nsec); - - return 0; -} - -#ifdef CONFIG_MIPS_CLOCK_VSYSCALL - -/* - * This is behind the ifdef so that we don't provide the symbol when there's no - * possibility of there being a usable clocksource, because there's nothing we - * can do without it. When libc fails the symbol lookup it should fall back on - * the standard syscall path. - */ -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - const union mips_vdso_data *data = get_vdso_data(); - struct timespec ts; - int ret; - - ret = do_realtime(&ts, data); - if (ret) - return gettimeofday_fallback(tv, tz); - - if (tv) { - tv->tv_sec = ts.tv_sec; - tv->tv_usec = ts.tv_nsec / 1000; - } - - if (tz) { - tz->tz_minuteswest = data->tz_minuteswest; - tz->tz_dsttime = data->tz_dsttime; - } - - return 0; -} - -#endif /* CONFIG_MIPS_CLOCK_VSYSCALL */ - -int __vdso_clock_gettime(clockid_t clkid, struct timespec *ts) -{ - const union mips_vdso_data *data = get_vdso_data(); - int ret = -1; - - switch (clkid) { - case CLOCK_REALTIME_COARSE: - ret = do_realtime_coarse(ts, data); - break; - case CLOCK_MONOTONIC_COARSE: - ret = do_monotonic_coarse(ts, data); - break; - case CLOCK_REALTIME: - ret = do_realtime(ts, data); - break; - case CLOCK_MONOTONIC: - ret = do_monotonic(ts, data); - break; - default: - break; - } - - if (ret) - ret = clock_gettime_fallback(clkid, ts); - - return ret; -} From 8919975b617197ebe39290d379caa02caf6bf3e3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 2 Oct 2019 17:46:36 +0000 Subject: [PATCH 244/312] MIPS: VDSO: Fix build for binutils < 2.25 Versions of binutils prior to 2.25 are unable to link our VDSO due to an unsupported R_MIPS_PC32 relocation generated by the ".word _start - ." line of the inline asm in get_vdso_base(). As such, the intent is that when building with binutils older than 2.25 we don't build code for gettimeofday() & friends in the VDSO that rely upon get_vdso_base(). Commit 24640f233b46 ("mips: Add support for generic vDSO") converted us to using generic VDSO infrastructure, and as part of that the gettimeofday() functionality moved to a new vgettimeofday.c file. The check for binutils < 2.25 wasn't updated to handle this new filename, and so it continues trying to remove the old unused filename from the build. The end result is that we try to include the gettimeofday() code in builds that will fail to link. Fix this by updating the binutils < 2.25 case to remove vgettimeofday.c from obj-vdso-y, rather than gettimeofday.c. Signed-off-by: Paul Burton Fixes: 24640f233b46 ("mips: Add support for generic vDSO") Cc: Vincenzo Frascino Cc: linux-mips@vger.kernel.org --- arch/mips/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 69cfa0a5339e..807f0f782f75 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -59,7 +59,7 @@ CFLAGS_REMOVE_vgettimeofday.o = -pg ifndef CONFIG_CPU_MIPSR6 ifeq ($(call ld-ifversion, -lt, 225000000, y),y) $(warning MIPS VDSO requires binutils >= 2.25) - obj-vdso-y := $(filter-out gettimeofday.o, $(obj-vdso-y)) + obj-vdso-y := $(filter-out vgettimeofday.o, $(obj-vdso-y)) ccflags-vdso += -DDISABLE_MIPS_VDSO endif endif From 66b416ee41ed7a8216e5234a96288dbb8124e4b6 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 24 Sep 2019 17:19:56 +0200 Subject: [PATCH 245/312] MIPS: init: Fix reservation of memory between PHYS_OFFSET and mem start Fix calculation of the size for reserving memory between PHYS_OFFSET and real memory start. Fixes: a94e4f24ec83 ("MIPS: init: Drop boot_mem_map") Signed-off-by: Thomas Bogendoerfer Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index b8249c233754..f5c6b4c6de24 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -321,7 +321,7 @@ static void __init bootmem_init(void) * Reserve any memory between the start of RAM and PHYS_OFFSET */ if (ramstart > PHYS_OFFSET) - memblock_reserve(PHYS_OFFSET, PFN_UP(ramstart) - PHYS_OFFSET); + memblock_reserve(PHYS_OFFSET, ramstart - PHYS_OFFSET); if (PFN_UP(ramstart) > ARCH_PFN_OFFSET) { pr_info("Wasting %lu bytes for tracking %lu unused pages\n", From bd848d1b9235d027e65fcc87de26cc1b02b41cc8 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 24 Sep 2019 17:20:51 +0200 Subject: [PATCH 246/312] MIPS: init: Prevent adding memory before PHYS_OFFSET On some SGI machines (IP28 and IP30) a small region of memory is mirrored to pyhsical address 0 for exception vectors while rest of the memory is reachable at a higher physical address. ARC PROM marks this region as reserved, but with commit a94e4f24ec83 ("MIPS: init: Drop boot_mem_map") this chunk is used, when searching for start of ram, which breaks at least IP28 and IP30 machines. To fix this add_region_memory() checks for start address < PHYS_OFFSET and ignores these chunks. Fixes: a94e4f24ec83 ("MIPS: init: Drop boot_mem_map") Signed-off-by: Thomas Bogendoerfer Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/setup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index f5c6b4c6de24..5eec13b8d222 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -108,6 +108,9 @@ void __init add_memory_region(phys_addr_t start, phys_addr_t size, long type) return; } + if (start < PHYS_OFFSET) + return; + memblock_add(start, size); /* Reserve any memory except the ordinary RAM ranges. */ switch (type) { From 25b69a889b638b0b7e51e2c4fe717a66bec0e566 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 10 Sep 2019 05:59:07 +0200 Subject: [PATCH 247/312] mips: Loongson: Fix the link time qualifier of 'serial_exit()' 'exit' functions should be marked as __exit, not __init. Fixes: 85cc028817ef ("mips: make loongsoon serial driver explicitly modular") Signed-off-by: Christophe JAILLET Signed-off-by: Paul Burton Cc: chenhc@lemote.com Cc: ralf@linux-mips.org Cc: jhogan@kernel.org Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: kernel-janitors@vger.kernel.org --- arch/mips/loongson64/common/serial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/loongson64/common/serial.c b/arch/mips/loongson64/common/serial.c index ffefc1cb2612..98c3a7feb10f 100644 --- a/arch/mips/loongson64/common/serial.c +++ b/arch/mips/loongson64/common/serial.c @@ -110,7 +110,7 @@ static int __init serial_init(void) } module_init(serial_init); -static void __init serial_exit(void) +static void __exit serial_exit(void) { platform_device_unregister(&uart8250_device); } From 53de429f4e88f538f7a8ec2b18be8c0cd9b2c8e1 Mon Sep 17 00:00:00 2001 From: Yizhuo Date: Tue, 1 Oct 2019 13:24:39 -0700 Subject: [PATCH 248/312] net: hisilicon: Fix usage of uninitialized variable in function mdio_sc_cfg_reg_write() In function mdio_sc_cfg_reg_write(), variable "reg_value" could be uninitialized if regmap_read() fails. However, "reg_value" is used to decide the control flow later in the if statement, which is potentially unsafe. Signed-off-by: Yizhuo Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns_mdio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns_mdio.c b/drivers/net/ethernet/hisilicon/hns_mdio.c index 3e863a71c513..7df5d7d211d4 100644 --- a/drivers/net/ethernet/hisilicon/hns_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns_mdio.c @@ -148,11 +148,15 @@ static int mdio_sc_cfg_reg_write(struct hns_mdio_device *mdio_dev, { u32 time_cnt; u32 reg_value; + int ret; regmap_write(mdio_dev->subctrl_vbase, cfg_reg, set_val); for (time_cnt = MDIO_TIMEOUT; time_cnt; time_cnt--) { - regmap_read(mdio_dev->subctrl_vbase, st_reg, ®_value); + ret = regmap_read(mdio_dev->subctrl_vbase, st_reg, ®_value); + if (ret) + return ret; + reg_value &= st_msk; if ((!!check_st) == (!!reg_value)) break; From 0f1a7b3fac0583083ca19d4de47403511ced3521 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 2 Oct 2019 16:16:07 -0700 Subject: [PATCH 249/312] timer-of: don't use conditional expression with mixed 'void' types Randy Dunlap reports on the sparse list that sparse warns about this expression: of_irq->percpu ? free_percpu_irq(of_irq->irq, clkevt) : free_irq(of_irq->irq, clkevt); and honestly, sparse is correct to warn. The return type of free_percpu_irq() is 'void', while free_irq() returns a 'const void *' that is the devname argument passed in to the request_irq(). You can't mix a void type with a non-void types in a conditional expression according to the C standard. It so happens that gcc seems to accept it - and the resulting type of the expression is void - but there's really no reason for the kernel to have this kind of non-standard expression with no real upside. The natural way to write that expression is with an if-statement: if (of_irq->percpu) free_percpu_irq(of_irq->irq, clkevt); else free_irq(of_irq->irq, clkevt); which is more legible anyway. I'm not sure why that timer-of code seems to have this odd pattern. It does the same at allocation time, but at least there the types match, and it makes sense as an expression. Reported-by: Randy Dunlap Signed-off-by: Linus Torvalds --- drivers/clocksource/timer-of.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c index d8c2bd4391d0..11ff701ff4bb 100644 --- a/drivers/clocksource/timer-of.c +++ b/drivers/clocksource/timer-of.c @@ -25,7 +25,9 @@ static __init void timer_of_irq_exit(struct of_timer_irq *of_irq) struct clock_event_device *clkevt = &to->clkevt; - of_irq->percpu ? free_percpu_irq(of_irq->irq, clkevt) : + if (of_irq->percpu) + free_percpu_irq(of_irq->irq, clkevt); + else free_irq(of_irq->irq, clkevt); } From 134cc4cefad34d8d24670d8a911b59c3b89c6731 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 2 Oct 2019 16:49:46 +0200 Subject: [PATCH 250/312] net: stmmac: Avoid deadlock on suspend/resume The stmmac driver will try to acquire its private mutex during suspend via phylink_resolve() -> stmmac_mac_link_down() -> stmmac_eee_init(). However, the phylink configuration is updated with the private mutex held already, which causes a deadlock during suspend. Fix this by moving the phylink configuration updates out of the region of code protected by the private mutex. Fixes: 19e13cb27b99 ("net: stmmac: Hold rtnl lock in suspend/resume callbacks") Suggested-by: Bitan Biswas Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 843d53e084b7..c76a1336a451 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4716,10 +4716,10 @@ int stmmac_suspend(struct device *dev) if (!ndev || !netif_running(ndev)) return 0; - mutex_lock(&priv->lock); - phylink_mac_change(priv->phylink, false); + mutex_lock(&priv->lock); + netif_device_detach(ndev); stmmac_stop_all_queues(priv); @@ -4733,9 +4733,11 @@ int stmmac_suspend(struct device *dev) stmmac_pmt(priv, priv->hw, priv->wolopts); priv->irq_wake = 1; } else { + mutex_unlock(&priv->lock); rtnl_lock(); phylink_stop(priv->phylink); rtnl_unlock(); + mutex_lock(&priv->lock); stmmac_mac_set(priv, priv->ioaddr, false); pinctrl_pm_select_sleep_state(priv->device); @@ -4827,6 +4829,8 @@ int stmmac_resume(struct device *dev) stmmac_start_all_queues(priv); + mutex_unlock(&priv->lock); + if (!device_may_wakeup(priv->device)) { rtnl_lock(); phylink_start(priv->phylink); @@ -4835,8 +4839,6 @@ int stmmac_resume(struct device *dev) phylink_mac_change(priv->phylink, true); - mutex_unlock(&priv->lock); - return 0; } EXPORT_SYMBOL_GPL(stmmac_resume); From 3a4b46c3bc73240b31cd324b3551fc4231d9f1d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 20 Jul 2019 21:05:25 +0900 Subject: [PATCH 251/312] block: pg: add header include guard Add a header include guard just in case. Signed-off-by: Masahiro Yamada Signed-off-by: Jens Axboe --- include/uapi/linux/pg.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/pg.h b/include/uapi/linux/pg.h index 364c350e85cd..62b6f69bd9fb 100644 --- a/include/uapi/linux/pg.h +++ b/include/uapi/linux/pg.h @@ -35,6 +35,9 @@ */ +#ifndef _UAPI_LINUX_PG_H +#define _UAPI_LINUX_PG_H + #define PG_MAGIC 'P' #define PG_RESET 'Z' #define PG_COMMAND 'C' @@ -61,4 +64,4 @@ struct pg_read_hdr { }; -/* end of pg.h */ +#endif /* _UAPI_LINUX_PG_H */ From cffb4c3ea37248c4fc2f4ce747e5c24af88aec76 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 25 Sep 2019 10:21:09 +0200 Subject: [PATCH 252/312] drm/i915/dp: Fix dsc bpp calculations, v5. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was a integer wraparound when mode_clock became too high, and we didn't correct for the FEC overhead factor when dividing, with the calculations breaking at HBR3. As a result our calculated bpp was way too high, and the link width limitation never came into effect. Print out the resulting bpp calcululations as a sanity check, just in case we ever have to debug it later on again. We also used the wrong factor for FEC. While bspec mentions 2.4%, all the calculations use 1/0.972261, and the same ratio should be applied to data M/N as well, so use it there when FEC is enabled. This fixes the FIFO underrun we are seeing with FEC enabled. Changes since v2: - Handle fec_enable in intel_link_compute_m_n, so only data M/N is adjusted. (Ville) - Fix initial hardware readout for FEC. (Ville) Changes since v3: - Remove bogus fec_to_mode_clock. (Ville) Changes since v4: - Use the correct register for icl. (Ville) - Split hw readout to a separate patch. Signed-off-by: Maarten Lankhorst Fixes: d9218c8f6cf4 ("drm/i915/dp: Add helpers for Compressed BPP and Slice Count for DSC") Cc: # v5.0+ Cc: Manasi Navare Link: https://patchwork.freedesktop.org/patch/msgid/20190925082110.17439-1-maarten.lankhorst@linux.intel.com Reviewed-by: Ville Syrjälä (cherry picked from commit ed06efb801bd291e935238d3fba46fa03d098f0e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display.c | 12 +- drivers/gpu/drm/i915/display/intel_display.h | 2 +- drivers/gpu/drm/i915/display/intel_dp.c | 184 ++++++++++--------- drivers/gpu/drm/i915/display/intel_dp.h | 6 +- drivers/gpu/drm/i915/display/intel_dp_mst.c | 2 +- 5 files changed, 107 insertions(+), 99 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index b51d1ceb8739..ce05e805b08f 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -7261,7 +7261,7 @@ retry: pipe_config->fdi_lanes = lane; intel_link_compute_m_n(pipe_config->pipe_bpp, lane, fdi_dotclock, - link_bw, &pipe_config->fdi_m_n, false); + link_bw, &pipe_config->fdi_m_n, false, false); ret = ironlake_check_fdi_lanes(dev, intel_crtc->pipe, pipe_config); if (ret == -EDEADLK) @@ -7508,11 +7508,15 @@ void intel_link_compute_m_n(u16 bits_per_pixel, int nlanes, int pixel_clock, int link_clock, struct intel_link_m_n *m_n, - bool constant_n) + bool constant_n, bool fec_enable) { - m_n->tu = 64; + u32 data_clock = bits_per_pixel * pixel_clock; - compute_m_n(bits_per_pixel * pixel_clock, + if (fec_enable) + data_clock = intel_dp_mode_to_fec_clock(data_clock); + + m_n->tu = 64; + compute_m_n(data_clock, link_clock * nlanes * 8, &m_n->gmch_m, &m_n->gmch_n, constant_n); diff --git a/drivers/gpu/drm/i915/display/intel_display.h b/drivers/gpu/drm/i915/display/intel_display.h index e57e6969051d..01fa87ad3270 100644 --- a/drivers/gpu/drm/i915/display/intel_display.h +++ b/drivers/gpu/drm/i915/display/intel_display.h @@ -414,7 +414,7 @@ enum phy_fia { void intel_link_compute_m_n(u16 bpp, int nlanes, int pixel_clock, int link_clock, struct intel_link_m_n *m_n, - bool constant_n); + bool constant_n, bool fec_enable); bool is_ccs_modifier(u64 modifier); void lpt_disable_clkout_dp(struct drm_i915_private *dev_priv); u32 intel_plane_fb_max_stride(struct drm_i915_private *dev_priv, diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 921ad0a2f7ba..57e9f0ba331b 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -78,8 +78,8 @@ #define DP_DSC_MAX_ENC_THROUGHPUT_0 340000 #define DP_DSC_MAX_ENC_THROUGHPUT_1 400000 -/* DP DSC FEC Overhead factor = (100 - 2.4)/100 */ -#define DP_DSC_FEC_OVERHEAD_FACTOR 976 +/* DP DSC FEC Overhead factor = 1/(0.972261) */ +#define DP_DSC_FEC_OVERHEAD_FACTOR 972261 /* Compliance test status bits */ #define INTEL_DP_RESOLUTION_SHIFT_MASK 0 @@ -494,6 +494,97 @@ int intel_dp_get_link_train_fallback_values(struct intel_dp *intel_dp, return 0; } +u32 intel_dp_mode_to_fec_clock(u32 mode_clock) +{ + return div_u64(mul_u32_u32(mode_clock, 1000000U), + DP_DSC_FEC_OVERHEAD_FACTOR); +} + +static u16 intel_dp_dsc_get_output_bpp(u32 link_clock, u32 lane_count, + u32 mode_clock, u32 mode_hdisplay) +{ + u32 bits_per_pixel, max_bpp_small_joiner_ram; + int i; + + /* + * Available Link Bandwidth(Kbits/sec) = (NumberOfLanes)* + * (LinkSymbolClock)* 8 * (TimeSlotsPerMTP) + * for SST -> TimeSlotsPerMTP is 1, + * for MST -> TimeSlotsPerMTP has to be calculated + */ + bits_per_pixel = (link_clock * lane_count * 8) / + intel_dp_mode_to_fec_clock(mode_clock); + DRM_DEBUG_KMS("Max link bpp: %u\n", bits_per_pixel); + + /* Small Joiner Check: output bpp <= joiner RAM (bits) / Horiz. width */ + max_bpp_small_joiner_ram = DP_DSC_MAX_SMALL_JOINER_RAM_BUFFER / mode_hdisplay; + DRM_DEBUG_KMS("Max small joiner bpp: %u\n", max_bpp_small_joiner_ram); + + /* + * Greatest allowed DSC BPP = MIN (output BPP from available Link BW + * check, output bpp from small joiner RAM check) + */ + bits_per_pixel = min(bits_per_pixel, max_bpp_small_joiner_ram); + + /* Error out if the max bpp is less than smallest allowed valid bpp */ + if (bits_per_pixel < valid_dsc_bpp[0]) { + DRM_DEBUG_KMS("Unsupported BPP %u, min %u\n", + bits_per_pixel, valid_dsc_bpp[0]); + return 0; + } + + /* Find the nearest match in the array of known BPPs from VESA */ + for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp) - 1; i++) { + if (bits_per_pixel < valid_dsc_bpp[i + 1]) + break; + } + bits_per_pixel = valid_dsc_bpp[i]; + + /* + * Compressed BPP in U6.4 format so multiply by 16, for Gen 11, + * fractional part is 0 + */ + return bits_per_pixel << 4; +} + +static u8 intel_dp_dsc_get_slice_count(struct intel_dp *intel_dp, + int mode_clock, int mode_hdisplay) +{ + u8 min_slice_count, i; + int max_slice_width; + + if (mode_clock <= DP_DSC_PEAK_PIXEL_RATE) + min_slice_count = DIV_ROUND_UP(mode_clock, + DP_DSC_MAX_ENC_THROUGHPUT_0); + else + min_slice_count = DIV_ROUND_UP(mode_clock, + DP_DSC_MAX_ENC_THROUGHPUT_1); + + max_slice_width = drm_dp_dsc_sink_max_slice_width(intel_dp->dsc_dpcd); + if (max_slice_width < DP_DSC_MIN_SLICE_WIDTH_VALUE) { + DRM_DEBUG_KMS("Unsupported slice width %d by DP DSC Sink device\n", + max_slice_width); + return 0; + } + /* Also take into account max slice width */ + min_slice_count = min_t(u8, min_slice_count, + DIV_ROUND_UP(mode_hdisplay, + max_slice_width)); + + /* Find the closest match to the valid slice count values */ + for (i = 0; i < ARRAY_SIZE(valid_dsc_slicecount); i++) { + if (valid_dsc_slicecount[i] > + drm_dp_dsc_sink_max_slice_count(intel_dp->dsc_dpcd, + false)) + break; + if (min_slice_count <= valid_dsc_slicecount[i]) + return valid_dsc_slicecount[i]; + } + + DRM_DEBUG_KMS("Unsupported Slice Count %d\n", min_slice_count); + return 0; +} + static enum drm_mode_status intel_dp_mode_valid(struct drm_connector *connector, struct drm_display_mode *mode) @@ -2226,7 +2317,7 @@ intel_dp_compute_config(struct intel_encoder *encoder, adjusted_mode->crtc_clock, pipe_config->port_clock, &pipe_config->dp_m_n, - constant_n); + constant_n, pipe_config->fec_enable); if (intel_connector->panel.downclock_mode != NULL && dev_priv->drrs.type == SEAMLESS_DRRS_SUPPORT) { @@ -2236,7 +2327,7 @@ intel_dp_compute_config(struct intel_encoder *encoder, intel_connector->panel.downclock_mode->clock, pipe_config->port_clock, &pipe_config->dp_m2_n2, - constant_n); + constant_n, pipe_config->fec_enable); } if (!HAS_DDI(dev_priv)) @@ -4323,91 +4414,6 @@ intel_dp_get_sink_irq_esi(struct intel_dp *intel_dp, u8 *sink_irq_vector) DP_DPRX_ESI_LEN; } -u16 intel_dp_dsc_get_output_bpp(int link_clock, u8 lane_count, - int mode_clock, int mode_hdisplay) -{ - u16 bits_per_pixel, max_bpp_small_joiner_ram; - int i; - - /* - * Available Link Bandwidth(Kbits/sec) = (NumberOfLanes)* - * (LinkSymbolClock)* 8 * ((100-FECOverhead)/100)*(TimeSlotsPerMTP) - * FECOverhead = 2.4%, for SST -> TimeSlotsPerMTP is 1, - * for MST -> TimeSlotsPerMTP has to be calculated - */ - bits_per_pixel = (link_clock * lane_count * 8 * - DP_DSC_FEC_OVERHEAD_FACTOR) / - mode_clock; - - /* Small Joiner Check: output bpp <= joiner RAM (bits) / Horiz. width */ - max_bpp_small_joiner_ram = DP_DSC_MAX_SMALL_JOINER_RAM_BUFFER / - mode_hdisplay; - - /* - * Greatest allowed DSC BPP = MIN (output BPP from avaialble Link BW - * check, output bpp from small joiner RAM check) - */ - bits_per_pixel = min(bits_per_pixel, max_bpp_small_joiner_ram); - - /* Error out if the max bpp is less than smallest allowed valid bpp */ - if (bits_per_pixel < valid_dsc_bpp[0]) { - DRM_DEBUG_KMS("Unsupported BPP %d\n", bits_per_pixel); - return 0; - } - - /* Find the nearest match in the array of known BPPs from VESA */ - for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp) - 1; i++) { - if (bits_per_pixel < valid_dsc_bpp[i + 1]) - break; - } - bits_per_pixel = valid_dsc_bpp[i]; - - /* - * Compressed BPP in U6.4 format so multiply by 16, for Gen 11, - * fractional part is 0 - */ - return bits_per_pixel << 4; -} - -u8 intel_dp_dsc_get_slice_count(struct intel_dp *intel_dp, - int mode_clock, - int mode_hdisplay) -{ - u8 min_slice_count, i; - int max_slice_width; - - if (mode_clock <= DP_DSC_PEAK_PIXEL_RATE) - min_slice_count = DIV_ROUND_UP(mode_clock, - DP_DSC_MAX_ENC_THROUGHPUT_0); - else - min_slice_count = DIV_ROUND_UP(mode_clock, - DP_DSC_MAX_ENC_THROUGHPUT_1); - - max_slice_width = drm_dp_dsc_sink_max_slice_width(intel_dp->dsc_dpcd); - if (max_slice_width < DP_DSC_MIN_SLICE_WIDTH_VALUE) { - DRM_DEBUG_KMS("Unsupported slice width %d by DP DSC Sink device\n", - max_slice_width); - return 0; - } - /* Also take into account max slice width */ - min_slice_count = min_t(u8, min_slice_count, - DIV_ROUND_UP(mode_hdisplay, - max_slice_width)); - - /* Find the closest match to the valid slice count values */ - for (i = 0; i < ARRAY_SIZE(valid_dsc_slicecount); i++) { - if (valid_dsc_slicecount[i] > - drm_dp_dsc_sink_max_slice_count(intel_dp->dsc_dpcd, - false)) - break; - if (min_slice_count <= valid_dsc_slicecount[i]) - return valid_dsc_slicecount[i]; - } - - DRM_DEBUG_KMS("Unsupported Slice Count %d\n", min_slice_count); - return 0; -} - static void intel_pixel_encoding_setup_vsc(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state) diff --git a/drivers/gpu/drm/i915/display/intel_dp.h b/drivers/gpu/drm/i915/display/intel_dp.h index 657bbb1f5ed0..00981fb9414b 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.h +++ b/drivers/gpu/drm/i915/display/intel_dp.h @@ -102,10 +102,6 @@ bool intel_dp_source_supports_hbr2(struct intel_dp *intel_dp); bool intel_dp_source_supports_hbr3(struct intel_dp *intel_dp); bool intel_dp_get_link_status(struct intel_dp *intel_dp, u8 *link_status); -u16 intel_dp_dsc_get_output_bpp(int link_clock, u8 lane_count, - int mode_clock, int mode_hdisplay); -u8 intel_dp_dsc_get_slice_count(struct intel_dp *intel_dp, int mode_clock, - int mode_hdisplay); bool intel_dp_read_dpcd(struct intel_dp *intel_dp); bool intel_dp_get_colorimetry_status(struct intel_dp *intel_dp); @@ -118,4 +114,6 @@ static inline unsigned int intel_dp_unused_lane_mask(int lane_count) return ~((1 << lane_count) - 1) & 0xf; } +u32 intel_dp_mode_to_fec_clock(u32 mode_clock); + #endif /* __INTEL_DP_H__ */ diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 6df240a01b8c..b6e3504c80a4 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -81,7 +81,7 @@ static int intel_dp_mst_compute_link_config(struct intel_encoder *encoder, adjusted_mode->crtc_clock, crtc_state->port_clock, &crtc_state->dp_m_n, - constant_n); + constant_n, crtc_state->fec_enable); crtc_state->dp_m_n.tu = slots; return 0; From eb0192fed016db1c5a9701cd6ca47233ff4a43e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 30 Sep 2019 21:30:45 +0300 Subject: [PATCH 253/312] drm/i915: Fix g4x sprite scaling stride check with GTT remapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I forgot to update the g4x sprite scaling stride check when GTT remapping was introduced. The stride of the original framebuffer is irrelevant when remapping is used and instead we want to check the stride of the remapped view. Also drop the duplicate width_bytes check. We already check that a few lines earlier. Fixes: df79cf441910 ("drm/i915: Store the final plane stride in plane_state") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20190930183045.662-1-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson (cherry picked from commit 006e570128f413759b9df64b51bae79903679c9b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_sprite.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c index dea63be1964f..cae25e493128 100644 --- a/drivers/gpu/drm/i915/display/intel_sprite.c +++ b/drivers/gpu/drm/i915/display/intel_sprite.c @@ -1528,6 +1528,7 @@ g4x_sprite_check_scaling(struct intel_crtc_state *crtc_state, int src_x, src_w, src_h, crtc_w, crtc_h; const struct drm_display_mode *adjusted_mode = &crtc_state->base.adjusted_mode; + unsigned int stride = plane_state->color_plane[0].stride; unsigned int cpp = fb->format->cpp[0]; unsigned int width_bytes; int min_width, min_height; @@ -1569,9 +1570,9 @@ g4x_sprite_check_scaling(struct intel_crtc_state *crtc_state, return -EINVAL; } - if (width_bytes > 4096 || fb->pitches[0] > 4096) { + if (stride > 4096) { DRM_DEBUG_KMS("Stride (%u) exceeds hardware max with scaling (%u)\n", - fb->pitches[0], 4096); + stride, 4096); return -EINVAL; } From e2c4ed148cf3ec8669a1d90dc66966028e5fad70 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Wed, 2 Oct 2019 15:25:42 +0300 Subject: [PATCH 254/312] drm/omap: fix max fclk divider for omap36xx The OMAP36xx and AM/DM37x TRMs say that the maximum divider for DSS fclk (in CM_CLKSEL_DSS) is 32. Experimentation shows that this is not correct, and using divider of 32 breaks DSS with a flood or underflows and sync losts. Dividers up to 31 seem to work fine. There is another patch to the DT files to limit the divider correctly, but as the DSS driver also needs to know the maximum divider to be able to iteratively find good rates, we also need to do the fix in the DSS driver. Signed-off-by: Tomi Valkeinen Cc: Adam Ford Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20191002122542.8449-1-tomi.valkeinen@ti.com Tested-by: Adam Ford Reviewed-by: Jyri Sarha --- drivers/gpu/drm/omapdrm/dss/dss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/omapdrm/dss/dss.c b/drivers/gpu/drm/omapdrm/dss/dss.c index 5711b7a720e6..25b6a79dc385 100644 --- a/drivers/gpu/drm/omapdrm/dss/dss.c +++ b/drivers/gpu/drm/omapdrm/dss/dss.c @@ -1090,7 +1090,7 @@ static const struct dss_features omap34xx_dss_feats = { static const struct dss_features omap3630_dss_feats = { .model = DSS_MODEL_OMAP3, - .fck_div_max = 32, + .fck_div_max = 31, .fck_freq_max = 173000000, .dss_fck_multiplier = 1, .parent_clk_name = "dpll4_ck", From 6e06983dde969c15eb4fdab77f0eda8b18ea28e6 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 2 Oct 2019 17:14:30 -0600 Subject: [PATCH 255/312] selftests: kvm: Fix libkvm build error Fix the following build error from "make TARGETS=kvm kselftest": libkvm.a(assert.o): relocation R_X86_64_32 against `.rodata.str1.1' can not be used when making a PIE object; recompile with -fPIC This error is seen when build is done from the main Makefile using kselftest target. In this case KBUILD_CPPFLAGS and CC_OPTION_CFLAGS are defined. When build is invoked using: "make -C tools/testing/selftests/kvm" KBUILD_CPPFLAGS and CC_OPTION_CFLAGS aren't defined. There is no need to pass in KBUILD_CPPFLAGS and CC_OPTION_CFLAGS for the check to determine if --no-pie is necessary, which is the case when these two aren't defined when "make -C tools/testing/selftests/kvm" runs. Fix it by simplifying the no-pie-option logic. With this change, both build variations work. "make TARGETS=kvm kselftest" "make -C tools/testing/selftests/kvm" Signed-off-by: Shuah Khan Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index fd84b7a78dcf..c5ec868fa1e5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -49,7 +49,7 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -I$(LINUX_HDR_PATH) -Iinclude -I$( Date: Tue, 1 Oct 2019 15:18:26 +0200 Subject: [PATCH 256/312] KVM: x86: omit absent pmu MSRs from MSR list INTEL_PMC_MAX_GENERIC is currently 32, which exceeds the 18 contiguous MSR indices reserved by Intel for event selectors. Since some machines actually have MSRs past the reserved range, these may survive the filtering of msrs_to_save array and would be rejected by KVM_GET/SET_MSR. To avoid this, cut the list to whatever CPUID reports for the host's architectural PMU. Reported-by: Vitaly Kuznetsov Suggested-by: Vitaly Kuznetsov Cc: Jim Mattson Fixes: e2ada66ec418 ("kvm: x86: Add Intel PMU MSRs to msrs_to_save[]", 2019-08-21) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8072acaaf028..31607174f442 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5105,13 +5105,14 @@ out: static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i, j; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, "Please update the fixed PMCs in msrs_to_save[]"); - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, - "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + + perf_get_x86_pmu_capability(&x86_pmu); for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) @@ -5153,6 +5154,15 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 31: + if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 31: + if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; } default: break; From 567926cca99ba1750be8aae9c4178796bf9bb90b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Oct 2019 09:21:23 -0700 Subject: [PATCH 257/312] KVM: nVMX: Fix consistency check on injected exception error code Current versions of Intel's SDM incorrectly state that "bits 31:15 of the VM-Entry exception error-code field" must be zero. In reality, bits 31:16 must be zero, i.e. error codes are 16-bit values. The bogus error code check manifests as an unexpected VM-Entry failure due to an invalid code field (error number 7) in L1, e.g. when injecting a #GP with error_code=0x9f00. Nadav previously reported the bug[*], both to KVM and Intel, and fixed the associated kvm-unit-test. [*] https://patchwork.kernel.org/patch/11124749/ Reported-by: Nadav Amit Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 41abc62c9a8a..e76eb4f07f6c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2610,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, /* VM-entry exception error code */ if (CC(has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ From 6af1799aaf3f1bc8defedddfa00df3192445bbf3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2019 09:38:55 -0700 Subject: [PATCH 258/312] ipv6: drop incoming packets having a v4mapped source address This began with a syzbot report. syzkaller was injecting IPv6 TCP SYN packets having a v4mapped source address. After an unsuccessful 4-tuple lookup, TCP creates a request socket (SYN_RECV) and calls reqsk_queue_hash_req() reqsk_queue_hash_req() calls sk_ehashfn(sk) At this point we have AF_INET6 sockets, and the heuristic used by sk_ehashfn() to either hash the IPv4 or IPv6 addresses is to use ipv6_addr_v4mapped(&sk->sk_v6_daddr) For the particular spoofed packet, we end up hashing V4 addresses which were not initialized by the TCP IPv6 stack, so KMSAN fired a warning. I first fixed sk_ehashfn() to test both source and destination addresses, but then faced various problems, including user-space programs like packetdrill that had similar assumptions. Instead of trying to fix the whole ecosystem, it is better to admit that we have a dual stack behavior, and that we can not build linux kernels without V4 stack anyway. The dual stack API automatically forces the traffic to be IPv4 if v4mapped addresses are used at bind() or connect(), so it makes no sense to allow IPv6 traffic to use the same v4mapped class. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Hannes Frederic Sowa Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 7e5df23cbe7b..3d71c7d6102c 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -223,6 +223,16 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; + /* While RFC4291 is not explicit about v4mapped addresses + * in IPv6 headers, it seems clear linux dual-stack + * model can not deal properly with these. + * Security models could be fooled by ::ffff:127.0.0.1 for example. + * + * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02 + */ + if (ipv6_addr_v4mapped(&hdr->saddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); From 086bf301f541eb4acfb5dadae7b1b207ad1b95a3 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 20 Sep 2019 10:44:47 -0700 Subject: [PATCH 259/312] MAINTAINERS: kgdb: Add myself as a reviewer for kgdb/kdb I'm interested in kdb / kgdb and have sent various fixes over the years. I'd like to get CCed on patches so I can be aware of them and also help review. Signed-off-by: Douglas Anderson Acked-by: Daniel Thompson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 296de2b51c83..acdd996774d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9075,6 +9075,7 @@ F: security/keys/ KGDB / KDB /debug_core M: Jason Wessel M: Daniel Thompson +R: Douglas Anderson W: http://kgdb.wiki.kernel.org/ L: kgdb-bugreport@lists.sourceforge.net T: git git://git.kernel.org/pub/scm/linux/kernel/git/jwessel/kgdb.git From 44b321e5020d782ad6e8ae8183f09b163be6e6e2 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 2 Oct 2019 13:29:22 -0400 Subject: [PATCH 260/312] udp: fix gso_segs calculations Commit dfec0ee22c0a ("udp: Record gso_segs when supporting UDP segmentation offload") added gso_segs calculation, but incorrectly got sizeof() the pointer and not the underlying data type. In addition let's fix the v6 case. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Fixes: dfec0ee22c0a ("udp: Record gso_segs when supporting UDP segmentation offload") Signed-off-by: Josh Hunt Reviewed-by: Alexander Duyck Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e8443cc5c1ab..3e2b90181879 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -856,7 +856,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh), + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), cork->gso_size); goto csum_partial; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index aae4938f3dea..eb9a9934ac05 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1143,6 +1143,8 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), + cork->gso_size); goto csum_partial; } From 4094871db1d65810acab3d57f6089aa39ef7f648 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 2 Oct 2019 13:29:23 -0400 Subject: [PATCH 261/312] udp: only do GSO if # of segs > 1 Prior to this change an application sending <= 1MSS worth of data and enabling UDP GSO would fail if the system had SW GSO enabled, but the same send would succeed if HW GSO offload is enabled. In addition to this inconsistency the error in the SW GSO case does not get back to the application if sending out of a real device so the user is unaware of this failure. With this change we only perform GSO if the # of segments is > 1 even if the application has enabled segmentation. I've also updated the relevant udpgso selftests. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Josh Hunt Reviewed-by: Willem de Bruijn Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/udp.c | 11 +++++++---- net/ipv6/udp.c | 11 +++++++---- tools/testing/selftests/net/udpgso.c | 16 ++++------------ 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3e2b90181879..14bc654b6842 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -821,6 +821,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; + int datalen = len - sizeof(*uh); __wsum csum = 0; /* @@ -854,10 +855,12 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, return -EIO; } - skb_shinfo(skb)->gso_size = cork->gso_size; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), - cork->gso_size); + if (datalen > cork->gso_size) { + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, + cork->gso_size); + } goto csum_partial; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index eb9a9934ac05..6324d3a8cb53 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1109,6 +1109,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, __wsum csum = 0; int offset = skb_transport_offset(skb); int len = skb->len - offset; + int datalen = len - sizeof(*uh); /* * Create a UDP header @@ -1141,10 +1142,12 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, return -EIO; } - skb_shinfo(skb)->gso_size = cork->gso_size; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), - cork->gso_size); + if (datalen > cork->gso_size) { + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, + cork->gso_size); + } goto csum_partial; } diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index b8265ee9923f..614b31aad168 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -89,12 +89,9 @@ struct testcase testcases_v4[] = { .tfail = true, }, { - /* send a single MSS: will fail with GSO, because the segment - * logic in udp4_ufo_fragment demands a gso skb to be > MTU - */ + /* send a single MSS: will fall back to no GSO */ .tlen = CONST_MSS_V4, .gso_len = CONST_MSS_V4, - .tfail = true, .r_num_mss = 1, }, { @@ -139,10 +136,9 @@ struct testcase testcases_v4[] = { .tfail = true, }, { - /* send a single 1B MSS: will fail, see single MSS above */ + /* send a single 1B MSS: will fall back to no GSO */ .tlen = 1, .gso_len = 1, - .tfail = true, .r_num_mss = 1, }, { @@ -196,12 +192,9 @@ struct testcase testcases_v6[] = { .tfail = true, }, { - /* send a single MSS: will fail with GSO, because the segment - * logic in udp4_ufo_fragment demands a gso skb to be > MTU - */ + /* send a single MSS: will fall back to no GSO */ .tlen = CONST_MSS_V6, .gso_len = CONST_MSS_V6, - .tfail = true, .r_num_mss = 1, }, { @@ -246,10 +239,9 @@ struct testcase testcases_v6[] = { .tfail = true, }, { - /* send a single 1B MSS: will fail, see single MSS above */ + /* send a single 1B MSS: will fall back to no GSO */ .tlen = 1, .gso_len = 1, - .tfail = true, .r_num_mss = 1, }, { From 2105b52e30debe7f19f3218598d8ae777dcc6776 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 Oct 2019 17:08:18 -0700 Subject: [PATCH 262/312] lib: textsearch: fix escapes in example code This textsearch code example does not need the '\' escapes and they can be misleading to someone reading the example. Also, gcc and sparse warn that the "\%d" is an unknown escape sequence. Fixes: 5968a70d7af5 ("textsearch: fix kernel-doc warnings and add kernel-api section") Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- lib/textsearch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/textsearch.c b/lib/textsearch.c index 4f16eec5d554..f68dea8806be 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c @@ -89,9 +89,9 @@ * goto errout; * } * - * pos = textsearch_find_continuous(conf, \&state, example, strlen(example)); + * pos = textsearch_find_continuous(conf, &state, example, strlen(example)); * if (pos != UINT_MAX) - * panic("Oh my god, dancing chickens at \%d\n", pos); + * panic("Oh my god, dancing chickens at %d\n", pos); * * textsearch_destroy(conf); */ From 3afb0961884046c8fb4acbce65139088959681c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2019 20:19:59 -0700 Subject: [PATCH 263/312] tcp: fix slab-out-of-bounds in tcp_zerocopy_receive() Apparently a refactoring patch brought a bug, that was caught by syzbot [1] Original code was correct, do not try to be smarter than the compiler :/ [1] BUG: KASAN: slab-out-of-bounds in tcp_zerocopy_receive net/ipv4/tcp.c:1807 [inline] BUG: KASAN: slab-out-of-bounds in do_tcp_getsockopt.isra.0+0x2c6c/0x3120 net/ipv4/tcp.c:3654 Read of size 4 at addr ffff8880943cf188 by task syz-executor.2/17508 CPU: 0 PID: 17508 Comm: syz-executor.2 Not tainted 5.3.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351 __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482 kasan_report+0x12/0x17 mm/kasan/common.c:618 __asan_report_load4_noabort+0x14/0x20 mm/kasan/generic_report.c:131 tcp_zerocopy_receive net/ipv4/tcp.c:1807 [inline] do_tcp_getsockopt.isra.0+0x2c6c/0x3120 net/ipv4/tcp.c:3654 tcp_getsockopt+0xbf/0xe0 net/ipv4/tcp.c:3680 sock_common_getsockopt+0x94/0xd0 net/core/sock.c:3098 __sys_getsockopt+0x16d/0x310 net/socket.c:2129 __do_sys_getsockopt net/socket.c:2144 [inline] __se_sys_getsockopt net/socket.c:2141 [inline] __x64_sys_getsockopt+0xbe/0x150 net/socket.c:2141 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 Fixes: d8e18a516f8f ("net: Use skb accessors in network core") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Cc: Matthew Wilcox (Oracle) Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 79c325a07ba5..f98a1882e537 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1798,13 +1798,11 @@ static int tcp_zerocopy_receive(struct sock *sk, } if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { int remaining = zc->recv_skip_hint; - int size = skb_frag_size(frags); - while (remaining && (size != PAGE_SIZE || + while (remaining && (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags))) { - remaining -= size; + remaining -= skb_frag_size(frags); frags++; - size = skb_frag_size(frags); } zc->recv_skip_hint -= remaining; break; From 341115822f8832f0c2d8af2f7e151c4c9a77bcd1 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 3 Oct 2019 10:11:21 -0700 Subject: [PATCH 264/312] usercopy: Add parentheses around assignment in test_copy_struct_from_user Clang warns: lib/test_user_copy.c:96:10: warning: using the result of an assignment as a condition without parentheses [-Wparentheses] if (ret |= test(umem_src == NULL, "kmalloc failed")) ~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/test_user_copy.c:96:10: note: place parentheses around the assignment to silence this warning if (ret |= test(umem_src == NULL, "kmalloc failed")) ^ ( ) lib/test_user_copy.c:96:10: note: use '!=' to turn this compound assignment into an inequality comparison if (ret |= test(umem_src == NULL, "kmalloc failed")) ^~ != Add the parentheses as it suggests because this is intentional. Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper") Link: https://github.com/ClangBuiltLinux/linux/issues/731 Signed-off-by: Nathan Chancellor Acked-by: Aleksa Sarai Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20191003171121.2723619-1-natechancellor@gmail.com Signed-off-by: Christian Brauner --- lib/test_user_copy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 950ee88cd6ac..e365ace06538 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -93,11 +93,11 @@ static int test_copy_struct_from_user(char *kmem, char __user *umem, size_t ksize, usize; umem_src = kmalloc(size, GFP_KERNEL); - if (ret |= test(umem_src == NULL, "kmalloc failed")) + if ((ret |= test(umem_src == NULL, "kmalloc failed"))) goto out_free; expected = kmalloc(size, GFP_KERNEL); - if (ret |= test(expected == NULL, "kmalloc failed")) + if ((ret |= test(expected == NULL, "kmalloc failed"))) goto out_free; /* Fill umem with a fixed byte pattern. */ From 501bd0166eb949820c8e4204e62aaee5c89c84d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 27 Sep 2019 17:28:42 +0200 Subject: [PATCH 265/312] fork: add kernel-doc for clone3 Add kernel-doc for the clone3() syscall. Link: https://lore.kernel.org/r/20191001114701.24661-2-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- kernel/fork.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..bf11cf39579a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2604,6 +2604,17 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) return true; } +/** + * clone3 - create a new process with specific properties + * @uargs: argument structure + * @size: size of @uargs + * + * clone3() is the extensible successor to clone()/clone2(). + * It takes a struct as argument that is versioned by its size. + * + * Return: On success, a positive PID for the child process. + * On error, a negative errno number. + */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; From 78f6face5af344f12f4bd48b32faa6f499a06f36 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 27 Sep 2019 17:29:05 +0200 Subject: [PATCH 266/312] sched: add kernel-doc for struct clone_args Add kernel-doc for struct clone_args for the clone3() syscall. Link: https://lore.kernel.org/r/20191001114701.24661-3-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 851ff1feadd5..3d8f03bfd428 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,30 @@ #define CLONE_IO 0x80000000 /* Clone io context */ #ifndef __ASSEMBLY__ -/* - * Arguments for the clone3 syscall +/** + * struct clone_args - arguments for the clone3 syscall + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * + * The structure is versioned by size and thus extensible. + * New struct members must go at the end of the struct and + * must be properly 64bit aligned. */ struct clone_args { __aligned_u64 flags; From 485f682be9fc8d41376936a3b01423edd07b9a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 3 Sep 2019 18:40:18 +0300 Subject: [PATCH 267/312] Revert "drm/i915: Fix DP-MST crtc_mask" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 4eaceea3a00f8e936a7f48dcd0c975a57f88930f. Several userspace clients (modesetting ddx and mutter+wayland at least) handle encoder.possible_crtcs incorrectly. What they essentially do is the following: possible_crtcs = ~0; for_each_possible_encoder(connector) possible_crtcs &= encoder->possible_crtcs; Ie. they calculate the intersection of the possible_crtcs for the connector when they really should be calculating the union instead. In our case each MST encoder now has just one unique bit set, and so the intersection is always zero. The end result is that MST connectors can't be lit up because no crtc can be found to drive them. I've submitted a fix for the modesetting ddx [1], and complained on #wayland about mutter, so hopefully the situation will improve in the future. In the meantime we have regression, and so must go back to the old way of misconfiguring possible_crtcs in the kernel. [1] https://gitlab.freedesktop.org/xorg/xserver/merge_requests/277 Cc: Jonas Ådahl Cc: Stanislav Lisovskiy Cc: Lionel Landwerlin Cc: Dhinakaran Pandiyan Cc: Lucas De Marchi Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=111507 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20190903154018.26357-1-ville.syrjala@linux.intel.com Reviewed-by: José Roberto de Souza (cherry picked from commit e838bfa8e170415fa3cc8e83ecb171e809c0c422) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp_mst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index b6e3504c80a4..600873c796d0 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -615,7 +615,7 @@ intel_dp_create_fake_mst_encoder(struct intel_digital_port *intel_dig_port, enum intel_encoder->type = INTEL_OUTPUT_DP_MST; intel_encoder->power_domain = intel_dig_port->base.power_domain; intel_encoder->port = intel_dig_port->base.port; - intel_encoder->crtc_mask = BIT(pipe); + intel_encoder->crtc_mask = 0x7; intel_encoder->cloneable = 0; intel_encoder->compute_config = intel_dp_mst_compute_config; From dc301025658aef326763765b0bebeaa44569ed20 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 Oct 2019 19:23:05 -0700 Subject: [PATCH 268/312] block: sed-opal: fix sparse warning: obsolete array init. Fix sparse warning: (missing '=') ../block/sed-opal.c:133:17: warning: obsolete array initializer, use C99 syntax Fixes: ff91064ea37c ("block: sed-opal: check size of shadow mbr") Cc: linux-block@vger.kernel.org Cc: Jonas Rabenstein Cc: David Kozub Reviewed-by: Scott Bauer Reviewed-by: Revanth Rajashekar Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/sed-opal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 4e95a9792162..59a6ba2131d5 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -129,7 +129,7 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, /* tables */ - [OPAL_TABLE_TABLE] + [OPAL_TABLE_TABLE] = { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, From a9eb49c964884654dd6394cb6abe7ceb021c9c96 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 Oct 2019 19:23:15 -0700 Subject: [PATCH 269/312] block: sed-opal: fix sparse warning: convert __be64 data sparse warns about incorrect type when using __be64 data. It is not being converted to CPU-endian but it should be. Fixes these sparse warnings: ../block/sed-opal.c:375:20: warning: incorrect type in assignment (different base types) ../block/sed-opal.c:375:20: expected unsigned long long [usertype] align ../block/sed-opal.c:375:20: got restricted __be64 const [usertype] alignment_granularity ../block/sed-opal.c:376:25: warning: incorrect type in assignment (different base types) ../block/sed-opal.c:376:25: expected unsigned long long [usertype] lowest_lba ../block/sed-opal.c:376:25: got restricted __be64 const [usertype] lowest_aligned_lba Fixes: 455a7b238cd6 ("block: Add Sed-opal library") Cc: Scott Bauer Cc: Rafael Antognolli Cc: linux-block@vger.kernel.org Reviewed-by: Jon Derrick Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/sed-opal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 59a6ba2131d5..b4c761973ac1 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -372,8 +372,8 @@ static void check_geometry(struct opal_dev *dev, const void *data) { const struct d0_geometry_features *geo = data; - dev->align = geo->alignment_granularity; - dev->lowest_lba = geo->lowest_aligned_lba; + dev->align = be64_to_cpu(geo->alignment_granularity); + dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba); } static int execute_step(struct opal_dev *dev, From cc3a7bfe62b947b423fcb2cfe89fcba92bf48fa3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 2 Oct 2019 16:17:54 -0500 Subject: [PATCH 270/312] vfs: Fix EOVERFLOW testing in put_compat_statfs64 Today, put_compat_statfs64() disallows nearly any field value over 2^32 if f_bsize is only 32 bits, but that makes no sense. compat_statfs64 is there for the explicit purpose of providing 64-bit fields for f_files, f_ffree, etc. And f_bsize is always only 32 bits. As a result, 32-bit userspace gets -EOVERFLOW for i.e. large file counts even with -D_FILE_OFFSET_BITS=64 set. In reality, only f_bsize and f_frsize can legitimately overflow (fields like f_type and f_namelen should never be large), so test only those fields. This bug was discussed at length some time ago, and this is the proposal Al suggested at https://lkml.org/lkml/2018/8/6/640. It seemed to get dropped amid the discussion of other related changes, but this part seems obviously correct on its own, so I've picked it up and sent it, for expediency. Fixes: 64d2ab32efe3 ("vfs: fix put_compat_statfs64() does not handle errors") Signed-off-by: Eric Sandeen Signed-off-by: Linus Torvalds --- fs/statfs.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/statfs.c b/fs/statfs.c index eea7af6f2f22..2616424012ea 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -318,19 +318,10 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { struct compat_statfs64 buf; - if (sizeof(ubuf->f_bsize) == 4) { - if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | - kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } + + if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + memset(&buf, 0, sizeof(struct compat_statfs64)); buf.f_type = kbuf->f_type; buf.f_bsize = kbuf->f_bsize; From 101c40ed269ccf67d3535bf2d131584008edbbd1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 3 Oct 2019 22:46:23 +0000 Subject: [PATCH 271/312] MIPS: pmcs-msp71xx: Add missing MAX_PROM_MEM definition Commit b3c948e2c00f ("MIPS: msp: Record prom memory") introduced use of a MAX_PROM_MEM value but didn't define it. A bounds check in prom_meminit() suggests its value was supposed to be 5, so define it as such & adjust the bounds check to use the macro rather than a magic number. Signed-off-by: Paul Burton Fixes: b3c948e2c00f ("MIPS: msp: Record prom memory") Cc: Jiaxun Yang Cc: linux-mips@vger.kernel.org --- arch/mips/pmcs-msp71xx/msp_prom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/pmcs-msp71xx/msp_prom.c b/arch/mips/pmcs-msp71xx/msp_prom.c index dfb527961a27..f87c522a63e6 100644 --- a/arch/mips/pmcs-msp71xx/msp_prom.c +++ b/arch/mips/pmcs-msp71xx/msp_prom.c @@ -61,6 +61,7 @@ int init_debug = 1; /* memory blocks */ struct prom_pmemblock mdesc[PROM_MAX_PMEMBLOCKS]; +#define MAX_PROM_MEM 5 static phys_addr_t prom_mem_base[MAX_PROM_MEM] __initdata; static phys_addr_t prom_mem_size[MAX_PROM_MEM] __initdata; static unsigned int nr_prom_mem __initdata; @@ -358,7 +359,7 @@ void __init prom_meminit(void) p++; if (type == BOOT_MEM_ROM_DATA) { - if (nr_prom_mem >= 5) { + if (nr_prom_mem >= MAX_PROM_MEM) { pr_err("Too many ROM DATA regions"); continue; } From 437450cf09c382ff26c14c868d3a1e82c269be12 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 3 Oct 2019 22:46:36 +0000 Subject: [PATCH 272/312] MIPS: pmcs-msp71xx: Remove unused addr variable The addr variable in prom_free_prom_memory() has been unused since commit b3c948e2c00f ("MIPS: msp: Record prom memory"), causing a warning & build failure due to -Werror. Remove the unused variable. Signed-off-by: Paul Burton Fixes: b3c948e2c00f ("MIPS: msp: Record prom memory") Cc: Jiaxun Yang Cc: linux-mips@vger.kernel.org --- arch/mips/pmcs-msp71xx/msp_prom.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/mips/pmcs-msp71xx/msp_prom.c b/arch/mips/pmcs-msp71xx/msp_prom.c index f87c522a63e6..800a21b8b8b0 100644 --- a/arch/mips/pmcs-msp71xx/msp_prom.c +++ b/arch/mips/pmcs-msp71xx/msp_prom.c @@ -378,7 +378,6 @@ void __init prom_free_prom_memory(void) char *ptr; int len = 0; int i; - unsigned long addr; /* * preserve environment variables and command line from pmon/bbload From cf05a67b68b8d9d6469bedb63ee461f8c7de62e6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Oct 2019 15:33:07 +0200 Subject: [PATCH 273/312] KVM: x86: omit "impossible" pmu MSRs from MSR list INTEL_PMC_MAX_GENERIC is currently 32, which exceeds the 18 contiguous MSR indices reserved by Intel for event selectors. Since some machines actually have MSRs past the reserved range, filtering them against x86_pmu.num_counters_gp may have false positives. Cut the list to 18 entries to avoid this. Reported-by: Vitaly Kuznetsov Suggested-by: Vitaly Kuznetsov Cc: Jim Mattson Fixes: e2ada66ec418 ("kvm: x86: Add Intel PMU MSRs to msrs_to_save[]", 2019-08-21) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 31607174f442..661e2bf38526 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1169,13 +1169,6 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, - MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, - MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, - MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, - MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, - MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, - MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, - MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, @@ -1185,13 +1178,6 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, - MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, - MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, - MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, - MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, - MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, - MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; static unsigned num_msrs_to_save; @@ -5154,12 +5140,12 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 31: + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 31: + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; From 5c8e10f8326294b0f860da6e7266174f28fc1341 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 1 Oct 2019 22:08:01 +0200 Subject: [PATCH 274/312] s390: mark __cpacf_query() as __always_inline arch/s390/kvm/kvm-s390.c calls on several places __cpacf_query() directly, which makes it impossible to meet the "i" constraint for the asm operands (opcode in this case). As we are now force-enabling CONFIG_OPTIMIZE_INLINING on all architectures, this causes a build failure on s390: In file included from arch/s390/kvm/kvm-s390.c:44: ./arch/s390/include/asm/cpacf.h: In function '__cpacf_query': ./arch/s390/include/asm/cpacf.h:179:2: warning: asm operand 3 probably doesn't match constraints 179 | asm volatile( | ^~~ ./arch/s390/include/asm/cpacf.h:179:2: error: impossible constraint in 'asm' Mark __cpacf_query() as __always_inline in order to fix that, analogically how we fixes __cpacf_check_opcode(), cpacf_query_func() and scpacf_query() already. Reported-and-tested-by: Michal Kubecek Fixes: d83623c5eab2 ("s390: mark __cpacf_check_opcode() and cpacf_query_func() as __always_inline") Fixes: e60fb8bf68d4 ("s390/cpacf: mark scpacf_query() as __always_inline") Fixes: ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING") Signed-off-by: Jiri Kosina Link: https://lore.kernel.org/lkml/nycvar.YFH.7.76.1910012203010.13160@cbobk.fhfr.pm Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cpacf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index a092f63aac6a..c0f3bfeddcbe 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -171,7 +171,7 @@ typedef struct { unsigned char bytes[16]; } cpacf_mask_t; * * Returns 1 if @func is available for @opcode, 0 otherwise */ -static inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) +static __always_inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask) { register unsigned long r0 asm("0") = 0; /* query function */ register unsigned long r1 asm("1") = (unsigned long) mask; From 51ce02216d4ad4e8f6a58de81d6e803cf04c418e Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 3 Oct 2019 17:36:36 -0400 Subject: [PATCH 275/312] s390/mm: fix -Wunused-but-set-variable warnings Convert two functions to static inline to get ride of W=1 GCC warnings like, mm/gup.c: In function 'gup_pte_range': mm/gup.c:1816:16: warning: variable 'ptem' set but not used [-Wunused-but-set-variable] pte_t *ptep, *ptem; ^~~~ mm/mmap.c: In function 'acct_stack_growth': mm/mmap.c:2322:16: warning: variable 'new_start' set but not used [-Wunused-but-set-variable] unsigned long new_start; ^~~~~~~~~ Signed-off-by: Qian Cai Link: https://lore.kernel.org/lkml/1570138596-11913-1-git-send-email-cai@lca.pw/ Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/hugetlb.h | 9 +++++++-- arch/s390/include/asm/pgtable.h | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index bb59dd964590..de8f0bf5f238 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -12,8 +12,6 @@ #include #include - -#define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range free_pgd_range #define hugepages_supported() (MACHINE_HAS_EDAT1) @@ -23,6 +21,13 @@ pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +static inline bool is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + return false; +} + /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 36c578c0ff96..19c2cf001df3 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1269,7 +1269,8 @@ static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) #define pte_offset_kernel(pmd, address) pte_offset(pmd, address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_unmap(pte) do { } while (0) + +static inline void pte_unmap(pte_t *pte) { } static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { From b4fd5a0a9295d87da629c72378a7f4967a277030 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Oct 2019 12:28:43 +0200 Subject: [PATCH 276/312] s390/atomic,bitops: mark function(s) __always_inline Always inline asm inlines with variable operands for "i" constraints, since they won't compile if the compiler would decide to not inline them. Reported-by: Michal Kubecek Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/atomic_ops.h | 2 +- arch/s390/include/asm/bitops.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h index d3f09526ee19..61467b9eecc7 100644 --- a/arch/s390/include/asm/atomic_ops.h +++ b/arch/s390/include/asm/atomic_ops.h @@ -41,7 +41,7 @@ __ATOMIC_OPS(__atomic64_xor, long, "laxg") #undef __ATOMIC_OP #define __ATOMIC_CONST_OP(op_name, op_type, op_string, op_barrier) \ -static inline void op_name(op_type val, op_type *ptr) \ +static __always_inline void op_name(op_type val, op_type *ptr) \ { \ asm volatile( \ op_string " %[ptr],%[val]\n" \ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index b8833ac983fa..eb7eed43e780 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -56,7 +56,7 @@ __bitops_byte(unsigned long nr, volatile unsigned long *ptr) return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3); } -static inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) +static __always_inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); unsigned long mask; @@ -77,7 +77,7 @@ static inline void arch_set_bit(unsigned long nr, volatile unsigned long *ptr) __atomic64_or(mask, (long *)addr); } -static inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) +static __always_inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); unsigned long mask; @@ -98,8 +98,8 @@ static inline void arch_clear_bit(unsigned long nr, volatile unsigned long *ptr) __atomic64_and(mask, (long *)addr); } -static inline void arch_change_bit(unsigned long nr, - volatile unsigned long *ptr) +static __always_inline void arch_change_bit(unsigned long nr, + volatile unsigned long *ptr) { unsigned long *addr = __bitops_word(nr, ptr); unsigned long mask; From 44967bfb552090981751074aeb4ad372f5979902 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Oct 2019 12:29:00 +0200 Subject: [PATCH 277/312] s390/cpu_mf: mark function(s) __always_inline Always inline asm inlines with variable operands for "i" constraints, since they won't compile if the compiler would decide to not inline them. Reported-by: Michal Kubecek Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cpu_mf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 753669bfbe99..819803a97c2b 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -223,7 +223,8 @@ enum stcctm_ctr_set { MT_DIAG = 5, MT_DIAG_CLEARING = 9, /* clears loss-of-MT-ctr-data alert */ }; -static inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest) + +static __always_inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest) { int cc; From 7f032febb665f6c1fe95068be14aa891a4dfb8fc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Oct 2019 12:29:18 +0200 Subject: [PATCH 278/312] s390/jump_label: mark function(s) __always_inline Always inline asm inlines with variable operands for "i" constraints, since they won't compile if the compiler would decide to not inline them. Reported-by: Michal Kubecek Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/jump_label.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index e548ec1ec12c..39f747d63758 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -20,7 +20,7 @@ * We use a brcl 0,2 instruction for jump labels at compile time so it * can be easily distinguished from a hotpatch generated instruction. */ -static inline bool arch_static_branch(struct static_key *key, bool branch) +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("0: brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n" ".pushsection __jump_table,\"aw\"\n" @@ -34,7 +34,7 @@ label: return true; } -static inline bool arch_static_branch_jump(struct static_key *key, bool branch) +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) { asm_volatile_goto("0: brcl 15,%l[label]\n" ".pushsection __jump_table,\"aw\"\n" From 6818b542a084689366a2b6211f79d9122aea4ac6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Oct 2019 12:29:37 +0200 Subject: [PATCH 279/312] s390/mm: mark function(s) __always_inline Always inline asm inlines with variable operands for "i" constraints, since they won't compile if the compiler would decide to not inline them. Reported-by: Michal Kubecek Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pgtable.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 19c2cf001df3..5ff98d76a66c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -997,9 +997,9 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 -static inline void __ptep_ipte(unsigned long address, pte_t *ptep, - unsigned long opt, unsigned long asce, - int local) +static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) { unsigned long pto = (unsigned long) ptep; @@ -1020,8 +1020,8 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep, : [r1] "a" (pto), [m4] "i" (local) : "memory"); } -static inline void __ptep_ipte_range(unsigned long address, int nr, - pte_t *ptep, int local) +static __always_inline void __ptep_ipte_range(unsigned long address, int nr, + pte_t *ptep, int local) { unsigned long pto = (unsigned long) ptep; @@ -1436,9 +1436,9 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_NODAT 0x1000 #define IDTE_GUEST_ASCE 0x2000 -static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, - unsigned long opt, unsigned long asce, - int local) +static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, unsigned long asce, + int local) { unsigned long sto; @@ -1462,9 +1462,9 @@ static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, } } -static inline void __pudp_idte(unsigned long addr, pud_t *pudp, - unsigned long opt, unsigned long asce, - int local) +static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, unsigned long asce, + int local) { unsigned long r3o; From 771c24f6da06042e8aa8303e37c069fa01f96a09 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Oct 2019 12:29:43 +0200 Subject: [PATCH 280/312] s390/pci: mark function(s) __always_inline Always inline asm inlines with variable operands for "i" constraints, since they won't compile if the compiler would decide to not inline them. Reported-by: Michal Kubecek Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_clp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 9bdff4defef1..e585a62d6530 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -66,7 +66,7 @@ static inline int clp_get_ilp(unsigned long *ilp) /* * Call Logical Processor with c=0, the give constant lps and an lpcb request. */ -static inline int clp_req(void *data, unsigned int lps) +static __always_inline int clp_req(void *data, unsigned int lps) { struct { u8 _[CLP_BLK_SIZE]; } *req = data; u64 ignored; From 61c313471574de93c63868689d42cd9fff297a1a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Oct 2019 15:05:03 +0200 Subject: [PATCH 281/312] s390: update defconfigs Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 24 ++++++++++++++++-------- arch/s390/configs/defconfig | 25 ++++++++++++++++--------- arch/s390/configs/zfcpdump_defconfig | 2 +- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 347f48702edb..38d64030aacf 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -44,6 +44,7 @@ CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y @@ -69,12 +70,13 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y CONFIG_MODULE_SIG_SHA256=y +CONFIG_UNUSED_SYMBOLS=y CONFIG_BLK_DEV_INTEGRITY=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_BSD_DISKLABEL=y @@ -370,6 +372,7 @@ CONFIG_NETLINK_DIAG=m CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m +# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y CONFIG_PCI_DEBUG=y CONFIG_HOTPLUG_PCI=y @@ -424,6 +427,7 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_WRITECACHE=m +CONFIG_DM_CLONE=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -435,6 +439,7 @@ CONFIG_DM_DELAY=m CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m CONFIG_NETDEVICES=y CONFIG_BONDING=m @@ -489,6 +494,7 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set +# CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set @@ -538,15 +544,16 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m -CONFIG_DRM=y -CONFIG_DRM_VIRTIO_GPU=y +CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y # CONFIG_HID is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m +CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_VFIO_MDEV=m @@ -580,6 +587,8 @@ CONFIG_NILFS2_FS=m CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y +CONFIG_FS_VERITY=y +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA_NETLINK_INTERFACE=y @@ -589,6 +598,7 @@ CONFIG_QFMT_V2=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m CONFIG_FSCACHE=m CONFIG_CACHEFILES=m @@ -648,12 +658,15 @@ CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y +CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_PCRYPT=m @@ -664,10 +677,6 @@ CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128L=m -CONFIG_CRYPTO_AEGIS256=m -CONFIG_CRYPTO_MORUS640=m -CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m @@ -739,7 +748,6 @@ CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=1024 -CONFIG_UNUSED_SYMBOLS=y CONFIG_HEADERS_INSTALL=y CONFIG_HEADERS_CHECK=y CONFIG_DEBUG_SECTION_MISMATCH=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8514b8b9500f..25f799849582 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -44,6 +44,7 @@ CONFIG_NUMA=y # CONFIG_NUMA_EMU is not set CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_CHSC_SCH=y @@ -66,11 +67,12 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y CONFIG_MODULE_SIG_SHA256=y +CONFIG_UNUSED_SYMBOLS=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y CONFIG_BSD_DISKLABEL=y @@ -363,6 +365,7 @@ CONFIG_NETLINK_DIAG=m CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m +# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y @@ -418,6 +421,7 @@ CONFIG_DM_CRYPT=m CONFIG_DM_SNAPSHOT=m CONFIG_DM_THIN_PROVISIONING=m CONFIG_DM_WRITECACHE=m +CONFIG_DM_CLONE=m CONFIG_DM_MIRROR=m CONFIG_DM_LOG_USERSPACE=m CONFIG_DM_RAID=m @@ -429,6 +433,7 @@ CONFIG_DM_DELAY=m CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y CONFIG_DM_SWITCH=m CONFIG_DM_INTEGRITY=m CONFIG_NETDEVICES=y @@ -484,6 +489,7 @@ CONFIG_MLX5_CORE_EN=y # CONFIG_NET_VENDOR_NVIDIA is not set # CONFIG_NET_VENDOR_OKI is not set # CONFIG_NET_VENDOR_PACKET_ENGINES is not set +# CONFIG_NET_VENDOR_PENSANDO is not set # CONFIG_NET_VENDOR_QLOGIC is not set # CONFIG_NET_VENDOR_QUALCOMM is not set # CONFIG_NET_VENDOR_RDC is not set @@ -533,16 +539,16 @@ CONFIG_WATCHDOG_CORE=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m -CONFIG_DRM=y -CONFIG_DRM_VIRTIO_GPU=y -# CONFIG_BACKLIGHT_CLASS_DEVICE is not set +CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y # CONFIG_HID is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m +CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_VFIO_MDEV=m @@ -573,6 +579,8 @@ CONFIG_NILFS2_FS=m CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y +CONFIG_FS_VERITY=y +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA_NETLINK_INTERFACE=y @@ -581,6 +589,7 @@ CONFIG_QFMT_V2=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m CONFIG_FSCACHE=m CONFIG_CACHEFILES=m @@ -639,12 +648,15 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +CONFIG_SECURITY_LOCKDOWN_LSM_EARLY=y CONFIG_INTEGRITY_SIGNATURE=y CONFIG_INTEGRITY_ASYMMETRIC_KEYS=y CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y +CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -656,10 +668,6 @@ CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128L=m -CONFIG_CRYPTO_AEGIS256=m -CONFIG_CRYPTO_MORUS640=m -CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m @@ -727,7 +735,6 @@ CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=1024 -CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_MEMORY_INIT=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index be09a208b608..20c51e5d9353 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -61,7 +61,7 @@ CONFIG_RAW_DRIVER=y CONFIG_CONFIGFS_FS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -# CONFIG_DIMLIB is not set +CONFIG_LSM="yama,loadpin,safesetid,integrity" CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y From 6822c29ddbbdeafd8d1b79ebe6c51b83efd55ae1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 4 Oct 2019 17:41:02 +0000 Subject: [PATCH 282/312] MIPS: fw/arc: Remove unused addr variable The addr variable in prom_free_prom_memory() has been unused since commit 0df1007677d5 ("MIPS: fw: Record prom memory"), leading to a compiler warning: arch/mips/fw/arc/memory.c:163:16: warning: unused variable 'addr' [-Wunused-variable] Fix this by removing the unused variable. Signed-off-by: Paul Burton Fixes: 0df1007677d5 ("MIPS: fw: Record prom memory") Cc: Jiaxun Yang Cc: linux-mips@vger.kernel.org --- arch/mips/fw/arc/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/mips/fw/arc/memory.c b/arch/mips/fw/arc/memory.c index af44b35d79a1..b4328b3b5288 100644 --- a/arch/mips/fw/arc/memory.c +++ b/arch/mips/fw/arc/memory.c @@ -160,7 +160,6 @@ void __init prom_meminit(void) void __init prom_free_prom_memory(void) { - unsigned long addr; int i; if (prom_flags & PROM_FLAG_DONT_FREE_TEMP) From cf74ac6db25d4002089e85cc623ad149ecc25614 Mon Sep 17 00:00:00 2001 From: Reinhard Speyerer Date: Thu, 3 Oct 2019 18:34:39 +0200 Subject: [PATCH 283/312] qmi_wwan: add support for Cinterion CLS8 devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Cinterion CLS8 devices. Use QMI_QUIRK_SET_DTR as required for Qualcomm MDM9x07 chipsets. T: Bus=01 Lev=03 Prnt=05 Port=01 Cnt=02 Dev#= 25 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1e2d ProdID=00b0 Rev= 3.18 S: Manufacturer=GEMALTO S: Product=USB Modem C:* #Ifs= 5 Cfg#= 1 Atr=80 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Reinhard Speyerer Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index b6dc5d714b5e..3d77cd402ba9 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1350,6 +1350,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1e2d, 0x0082, 4)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0082, 5)}, /* Cinterion PHxx,PXxx (2 RmNet) */ {QMI_FIXED_INTF(0x1e2d, 0x0083, 4)}, /* Cinterion PHxx,PXxx (1 RmNet + USB Audio)*/ + {QMI_QUIRK_SET_DTR(0x1e2d, 0x00b0, 4)}, /* Cinterion CLS8 */ {QMI_FIXED_INTF(0x413c, 0x81a2, 8)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a3, 8)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a4, 8)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ From db9b2e0af605e7c994784527abfd9276cabd718a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Oct 2019 17:44:44 +0100 Subject: [PATCH 284/312] rxrpc: Fix rxrpc_recvmsg tracepoint Fix the rxrpc_recvmsg tracepoint to handle being called with a NULL call parameter. Fixes: a25e21f0bcd2 ("rxrpc, afs: Use debug_ids rather than pointers in traces") Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a13a62db3565..edc5c887a44c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1068,7 +1068,7 @@ TRACE_EVENT(rxrpc_recvmsg, ), TP_fast_assign( - __entry->call = call->debug_id; + __entry->call = call ? call->debug_id : 0; __entry->why = why; __entry->seq = seq; __entry->offset = offset; From 7a512eb865aa5fcfb396c71047bc9c3b046836b3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 4 Oct 2019 00:44:40 +0300 Subject: [PATCH 285/312] net: make sock_prot_memory_pressure() return "const char *" This function returns string literals which are "const char *". Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index e23ec80a67bf..fac2b4d80de5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3497,7 +3497,7 @@ static long sock_prot_memory_allocated(struct proto *proto) return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } -static char *sock_prot_memory_pressure(struct proto *proto) +static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; From 8ae72cbf62d2c1879456c0c5872f958e18f53711 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 3 Oct 2019 14:46:15 -0700 Subject: [PATCH 286/312] Revert "ipv6: Handle race in addrconf_dad_work" This reverts commit a3ce2a21bb8969ae27917281244fa91bf5f286d7. Eric reported tests failings with commit. After digging into it, the bottom line is that the DAD sequence is not to be messed with. There are too many cases that are expected to proceed regardless of whether a device is up. Revert the patch and I will send a different solution for the problem Rajendra reported. Signed-off-by: David Ahern Cc: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dd3be06d5a06..6a576ff92c39 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4032,12 +4032,6 @@ static void addrconf_dad_work(struct work_struct *w) rtnl_lock(); - /* check if device was taken down before this delayed work - * function could be canceled - */ - if (idev->dead || !(idev->if_flags & IF_READY)) - goto out; - spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { action = DAD_BEGIN; @@ -4083,6 +4077,11 @@ static void addrconf_dad_work(struct work_struct *w) goto out; write_lock_bh(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) { + write_unlock_bh(&idev->lock); + goto out; + } + spin_lock(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DEAD) { spin_unlock(&ifp->lock); From 6b517374f4ea5a3c6e307e1219ec5f35d42e6d00 Mon Sep 17 00:00:00 2001 From: Vishal Kulkarni Date: Fri, 4 Oct 2019 04:06:15 +0530 Subject: [PATCH 287/312] cxgb4:Fix out-of-bounds MSI-X info array access When fetching free MSI-X vectors for ULDs, check for the error code before accessing MSI-X info array. Otherwise, an out-of-bounds access is attempted, which results in kernel panic. Fixes: 94cdb8bb993a ("cxgb4: Add support for dynamic allocation of resources for ULD") Signed-off-by: Shahjada Abul Husain Signed-off-by: Vishal Kulkarni Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index 5b602243d573..a4dead4ab0ed 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -137,13 +137,12 @@ static int uldrx_handler(struct sge_rspq *q, const __be64 *rsp, static int alloc_uld_rxqs(struct adapter *adap, struct sge_uld_rxq_info *rxq_info, bool lro) { - struct sge *s = &adap->sge; unsigned int nq = rxq_info->nrxq + rxq_info->nciq; + int i, err, msi_idx, que_idx = 0, bmap_idx = 0; struct sge_ofld_rxq *q = rxq_info->uldrxq; unsigned short *ids = rxq_info->rspq_id; - unsigned int bmap_idx = 0; + struct sge *s = &adap->sge; unsigned int per_chan; - int i, err, msi_idx, que_idx = 0; per_chan = rxq_info->nrxq / adap->params.nports; @@ -161,6 +160,10 @@ static int alloc_uld_rxqs(struct adapter *adap, if (msi_idx >= 0) { bmap_idx = get_msix_idx_from_bmap(adap); + if (bmap_idx < 0) { + err = -ENOSPC; + goto freeout; + } msi_idx = adap->msix_info_ulds[bmap_idx].idx; } err = t4_sge_alloc_rxq(adap, &q->rspq, false, From a54cdeeb04fc719e4c7f19d6e28dba7ea86cee5b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 4 Oct 2019 20:51:04 +0800 Subject: [PATCH 288/312] r8152: Set macpassthru in reset_resume callback r8152 may fail to establish network connection after resume from system suspend. If the USB port connects to r8152 lost its power during system suspend, the MAC address was written before is lost. The reason is that The MAC address doesn't get written again in its reset_resume callback. So let's set MAC address again in reset_resume callback. Also remove unnecessary lock as no other locking attempt will happen during reset_resume. Signed-off-by: Kai-Heng Feng Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 08726090570e..cee9fef925cd 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -4799,10 +4799,9 @@ static int rtl8152_reset_resume(struct usb_interface *intf) struct r8152 *tp = usb_get_intfdata(intf); clear_bit(SELECTIVE_SUSPEND, &tp->flags); - mutex_lock(&tp->control); tp->rtl_ops.init(tp); queue_delayed_work(system_long_wq, &tp->hw_phy_work, 0); - mutex_unlock(&tp->control); + set_ethernet_addr(tp); return rtl8152_resume(intf); } From b406472b5ad79ede8d10077f0c8f05505ace8b6d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 Oct 2019 15:11:17 +0200 Subject: [PATCH 289/312] net: ipv4: avoid mixed n_redirects and rate_tokens usage Since commit c09551c6ff7f ("net: ipv4: use a dedicated counter for icmp_v4 redirect packets") we use 'n_redirects' to account for redirect packets, but we still use 'rate_tokens' to compute the redirect packets exponential backoff. If the device sent to the relevant peer any ICMP error packet after sending a redirect, it will also update 'rate_token' according to the leaking bucket schema; typically 'rate_token' will raise above BITS_PER_LONG and the redirect packets backoff algorithm will produce undefined behavior. Fix the issue using 'n_redirects' to compute the exponential backoff in ip_rt_send_redirect(). Note that we still clear rate_tokens after a redirect silence period, to avoid changing an established behaviour. The root cause predates git history; before the mentioned commit in the critical scenario, the kernel stopped sending redirects, after the mentioned commit the behavior more randomic. Reported-by: Xiumei Mu Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: c09551c6ff7f ("net: ipv4: use a dedicated counter for icmp_v4 redirect packets") Signed-off-by: Paolo Abeni Acked-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/route.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7dcce724c78b..14654876127e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -916,16 +916,15 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (peer->rate_tokens == 0 || time_after(jiffies, (peer->rate_last + - (ip_rt_redirect_load << peer->rate_tokens)))) { + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; - ++peer->rate_tokens; ++peer->n_redirects; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - peer->rate_tokens == ip_rt_redirect_number) + peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); From ea977d19d918324ad5b66953f051a6ed07d0a3c5 Mon Sep 17 00:00:00 2001 From: Andrea Merello Date: Fri, 4 Oct 2019 15:53:32 +0200 Subject: [PATCH 290/312] net: phy: allow for reset line to be tied to a sleepy GPIO controller mdio_device_reset() makes use of the atomic-pretending API flavor for handling the PHY reset GPIO line. I found no hint that mdio_device_reset() is called from atomic context and indeed it uses usleep_range() since long time, so I would assume that it is OK to sleep there. This patch switch to gpiod_set_value_cansleep() in mdio_device_reset(). This is relevant if e.g. the PHY reset line is tied to a I2C GPIO controller. This has been tested on a ZynqMP board running an upstream 4.19 kernel and then hand-ported on current kernel tree. Signed-off-by: Andrea Merello Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c index e282600bd83e..c1d345c3cab3 100644 --- a/drivers/net/phy/mdio_device.c +++ b/drivers/net/phy/mdio_device.c @@ -121,7 +121,7 @@ void mdio_device_reset(struct mdio_device *mdiodev, int value) return; if (mdiodev->reset_gpio) - gpiod_set_value(mdiodev->reset_gpio, value); + gpiod_set_value_cansleep(mdiodev->reset_gpio, value); if (mdiodev->reset_ctrl) { if (value) From 2d819d250a1393a3e725715425ab70a0e0772a71 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 4 Oct 2019 08:03:09 -0700 Subject: [PATCH 291/312] ipv6: Handle missing host route in __ipv6_ifa_notify Rajendra reported a kernel panic when a link was taken down: [ 6870.263084] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 [ 6870.271856] IP: [] __ipv6_ifa_notify+0x154/0x290 [ 6870.570501] Call Trace: [ 6870.573238] [] ? ipv6_ifa_notify+0x26/0x40 [ 6870.579665] [] ? addrconf_dad_completed+0x4c/0x2c0 [ 6870.586869] [] ? ipv6_dev_mc_inc+0x196/0x260 [ 6870.593491] [] ? addrconf_dad_work+0x10a/0x430 [ 6870.600305] [] ? __switch_to_asm+0x34/0x70 [ 6870.606732] [] ? process_one_work+0x18a/0x430 [ 6870.613449] [] ? worker_thread+0x4d/0x490 [ 6870.619778] [] ? process_one_work+0x430/0x430 [ 6870.626495] [] ? kthread+0xd9/0xf0 [ 6870.632145] [] ? __switch_to_asm+0x34/0x70 [ 6870.638573] [] ? kthread_park+0x60/0x60 [ 6870.644707] [] ? ret_from_fork+0x57/0x70 [ 6870.650936] Code: 31 c0 31 d2 41 b9 20 00 08 02 b9 09 00 00 0 addrconf_dad_work is kicked to be scheduled when a device is brought up. There is a race between addrcond_dad_work getting scheduled and taking the rtnl lock and a process taking the link down (under rtnl). The latter removes the host route from the inet6_addr as part of addrconf_ifdown which is run for NETDEV_DOWN. The former attempts to use the host route in __ipv6_ifa_notify. If the down event removes the host route due to the race to the rtnl, then the BUG listed above occurs. Since the DAD sequence can not be aborted, add a check for the missing host route in __ipv6_ifa_notify. The only way this should happen is due to the previously mentioned race. The host route is created when the address is added to an interface; it is only removed on a down event where the address is kept. Add a warning if the host route is missing AND the device is up; this is a situation that should never happen. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Reported-by: Rajendra Dendukuri Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a576ff92c39..34ccef18b40e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5964,13 +5964,20 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) switch (event) { case RTM_NEWADDR: /* - * If the address was optimistic - * we inserted the route at the start of - * our DAD process, so we don't need - * to do it again + * If the address was optimistic we inserted the route at the + * start of our DAD process, so we don't need to do it again. + * If the device was taken down in the middle of the DAD + * cycle there is a race where we could get here without a + * host route, so nothing to insert. That will be fixed when + * the device is brought up. */ - if (!rcu_access_pointer(ifp->rt->fib6_node)) + if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) { ip6_ins_rt(net, ifp->rt); + } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) { + pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n", + &ifp->addr, ifp->idev->dev->name); + } + if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) From 4cf6c57e61fee954f7b7685de31b80ec26843d27 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:05:58 +0100 Subject: [PATCH 292/312] net: phy: fix write to mii-ctrl1000 register When userspace writes to the MII_ADVERTISE register, we update phylib's advertising mask and trigger a renegotiation. However, writing to the MII_CTRL1000 register, which contains the gigabit advertisement, does neither. This can lead to phylib's copy of the advertisement becoming de-synced with the values in the PHY register set, which can result in incorrect negotiation resolution. Fixes: 5502b218e001 ("net: phy: use phy_resolve_aneg_linkmode in genphy_read_status") Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 5 +++++ include/linux/mii.h | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 7c92afd36bbe..119e6f466056 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -457,6 +457,11 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) val); change_autoneg = true; break; + case MII_CTRL1000: + mii_ctrl1000_mod_linkmode_adv_t(phydev->advertising, + val); + change_autoneg = true; + break; default: /* do nothing */ break; diff --git a/include/linux/mii.h b/include/linux/mii.h index 5cd824c1c0ca..4ce8901a1af6 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -455,6 +455,15 @@ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, lp_advertising, lpa & LPA_LPACK); } +static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, + u32 ctrl1000) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, + ctrl1000 & ADVERTISE_1000HALF); + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, + ctrl1000 & ADVERTISE_1000FULL); +} + /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising From 8d3dc3ac9dd6801c732a72ca6979698c38451b4f Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:04 +0100 Subject: [PATCH 293/312] net: phy: extract link partner advertisement reading Move reading the link partner advertisement out of genphy_read_status() into its own separate function. This will allow re-use of this code by PHY drivers that are able to read the resolved status from the PHY. Tested-by: tinywrkb Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 65 ++++++++++++++++++++++-------------- include/linux/phy.h | 1 + 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index d347ddcac45b..9d2bbb13293e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1783,32 +1783,9 @@ done: } EXPORT_SYMBOL(genphy_update_link); -/** - * genphy_read_status - check the link status and update current link state - * @phydev: target phy_device struct - * - * Description: Check the link, then figure out the current state - * by comparing what we advertise with what the link partner - * advertises. Start by checking the gigabit possibilities, - * then move on to 10/100. - */ -int genphy_read_status(struct phy_device *phydev) +int genphy_read_lpa(struct phy_device *phydev) { - int lpa, lpagb, err, old_link = phydev->link; - - /* Update the link, but return if there was an error */ - err = genphy_update_link(phydev); - if (err) - return err; - - /* why bother the PHY if nothing can have changed */ - if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) - return 0; - - phydev->speed = SPEED_UNKNOWN; - phydev->duplex = DUPLEX_UNKNOWN; - phydev->pause = 0; - phydev->asym_pause = 0; + int lpa, lpagb; if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { if (phydev->is_gigabit_capable) { @@ -1838,6 +1815,44 @@ int genphy_read_status(struct phy_device *phydev) return lpa; mii_lpa_mod_linkmode_lpa_t(phydev->lp_advertising, lpa); + } + + return 0; +} +EXPORT_SYMBOL(genphy_read_lpa); + +/** + * genphy_read_status - check the link status and update current link state + * @phydev: target phy_device struct + * + * Description: Check the link, then figure out the current state + * by comparing what we advertise with what the link partner + * advertises. Start by checking the gigabit possibilities, + * then move on to 10/100. + */ +int genphy_read_status(struct phy_device *phydev) +{ + int err, old_link = phydev->link; + + /* Update the link, but return if there was an error */ + err = genphy_update_link(phydev); + if (err) + return err; + + /* why bother the PHY if nothing can have changed */ + if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) + return 0; + + phydev->speed = SPEED_UNKNOWN; + phydev->duplex = DUPLEX_UNKNOWN; + phydev->pause = 0; + phydev->asym_pause = 0; + + err = genphy_read_lpa(phydev); + if (err < 0) + return err; + + if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { phy_resolve_aneg_linkmode(phydev); } else if (phydev->autoneg == AUTONEG_DISABLE) { int bmcr = phy_read(phydev, MII_BMCR); diff --git a/include/linux/phy.h b/include/linux/phy.h index a7ecbe0e55aa..7abee820d05c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1076,6 +1076,7 @@ int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); +int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); From 2d880b8709c013d47472f85a9d42ea1aca3bce47 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:09 +0100 Subject: [PATCH 294/312] net: phy: extract pause mode Extract the update of phylib's software pause mode state from genphy_read_status(), so that we can re-use this functionality with PHYs that have alternative ways to read the negotiation results. Tested-by: tinywrkb Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 20 +++++++++++++------- include/linux/phy.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 369903d9b6ec..9412669b579c 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -283,6 +283,18 @@ void of_set_phy_eee_broken(struct phy_device *phydev) phydev->eee_broken_modes = broken; } +void phy_resolve_aneg_pause(struct phy_device *phydev) +{ + if (phydev->duplex == DUPLEX_FULL) { + phydev->pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + phydev->lp_advertising); + phydev->asym_pause = linkmode_test_bit( + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + phydev->lp_advertising); + } +} +EXPORT_SYMBOL_GPL(phy_resolve_aneg_pause); + /** * phy_resolve_aneg_linkmode - resolve the advertisements into phy settings * @phydev: The phy_device struct @@ -305,13 +317,7 @@ void phy_resolve_aneg_linkmode(struct phy_device *phydev) break; } - if (phydev->duplex == DUPLEX_FULL) { - phydev->pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, - phydev->lp_advertising); - phydev->asym_pause = linkmode_test_bit( - ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->lp_advertising); - } + phy_resolve_aneg_pause(phydev); } EXPORT_SYMBOL_GPL(phy_resolve_aneg_linkmode); diff --git a/include/linux/phy.h b/include/linux/phy.h index 7abee820d05c..9a0e981df502 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -678,6 +678,7 @@ static inline bool phy_is_started(struct phy_device *phydev) return phydev->state >= PHY_UP; } +void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); /** From 06d5f3441b2e80eeb6deb0885aefa00589e463c1 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:14 +0100 Subject: [PATCH 295/312] net: phy: at803x: use operating parameters from PHY-specific status Read the PHY-specific status register for the current operating mode (speed and duplex) of the PHY. This register reflects the actual mode that the PHY has resolved depending on either the advertisements of autoneg is enabled, or the forced mode if autoneg is disabled. This ensures that phylib's software state always tracks the hardware state. It seems both AR8033 (which uses the AR8031 ID) and AR8035 support this status register. AR8030 is not known at the present time. This patch depends on "net: phy: extract pause mode" and "net: phy: extract link partner advertisement reading". Reported-by: tinywrkb Reviewed-by: Andrew Lunn Tested-by: tinywrkb Fixes: 5502b218e001 ("net: phy: use phy_resolve_aneg_linkmode in genphy_read_status") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 69 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 2aa7b2e60046..1eb5d4fb8925 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -15,6 +15,15 @@ #include #include +#define AT803X_SPECIFIC_STATUS 0x11 +#define AT803X_SS_SPEED_MASK (3 << 14) +#define AT803X_SS_SPEED_1000 (2 << 14) +#define AT803X_SS_SPEED_100 (1 << 14) +#define AT803X_SS_SPEED_10 (0 << 14) +#define AT803X_SS_DUPLEX BIT(13) +#define AT803X_SS_SPEED_DUPLEX_RESOLVED BIT(11) +#define AT803X_SS_MDIX BIT(6) + #define AT803X_INTR_ENABLE 0x12 #define AT803X_INTR_ENABLE_AUTONEG_ERR BIT(15) #define AT803X_INTR_ENABLE_SPEED_CHANGED BIT(14) @@ -357,6 +366,64 @@ static int at803x_aneg_done(struct phy_device *phydev) return aneg_done; } +static int at803x_read_status(struct phy_device *phydev) +{ + int ss, err, old_link = phydev->link; + + /* Update the link, but return if there was an error */ + err = genphy_update_link(phydev); + if (err) + return err; + + /* why bother the PHY if nothing can have changed */ + if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) + return 0; + + phydev->speed = SPEED_UNKNOWN; + phydev->duplex = DUPLEX_UNKNOWN; + phydev->pause = 0; + phydev->asym_pause = 0; + + err = genphy_read_lpa(phydev); + if (err < 0) + return err; + + /* Read the AT8035 PHY-Specific Status register, which indicates the + * speed and duplex that the PHY is actually using, irrespective of + * whether we are in autoneg mode or not. + */ + ss = phy_read(phydev, AT803X_SPECIFIC_STATUS); + if (ss < 0) + return ss; + + if (ss & AT803X_SS_SPEED_DUPLEX_RESOLVED) { + switch (ss & AT803X_SS_SPEED_MASK) { + case AT803X_SS_SPEED_10: + phydev->speed = SPEED_10; + break; + case AT803X_SS_SPEED_100: + phydev->speed = SPEED_100; + break; + case AT803X_SS_SPEED_1000: + phydev->speed = SPEED_1000; + break; + } + if (ss & AT803X_SS_DUPLEX) + phydev->duplex = DUPLEX_FULL; + else + phydev->duplex = DUPLEX_HALF; + if (ss & AT803X_SS_MDIX) + phydev->mdix = ETH_TP_MDI_X; + else + phydev->mdix = ETH_TP_MDI; + } + + if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) + phy_resolve_aneg_pause(phydev); + + return 0; +} + static struct phy_driver at803x_driver[] = { { /* ATHEROS 8035 */ @@ -370,6 +437,7 @@ static struct phy_driver at803x_driver[] = { .suspend = at803x_suspend, .resume = at803x_resume, /* PHY_GBIT_FEATURES */ + .read_status = at803x_read_status, .ack_interrupt = at803x_ack_interrupt, .config_intr = at803x_config_intr, }, { @@ -399,6 +467,7 @@ static struct phy_driver at803x_driver[] = { .suspend = at803x_suspend, .resume = at803x_resume, /* PHY_GBIT_FEATURES */ + .read_status = at803x_read_status, .aneg_done = at803x_aneg_done, .ack_interrupt = &at803x_ack_interrupt, .config_intr = &at803x_config_intr, From 474f0813a3002cb299bb73a5a93aa1f537a80ca8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2019 10:34:45 -0700 Subject: [PATCH 296/312] sch_dsmark: fix potential NULL deref in dsmark_init() Make sure TCA_DSMARK_INDICES was provided by the user. syzbot reported : kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8799 Comm: syz-executor235 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:nla_get_u16 include/net/netlink.h:1501 [inline] RIP: 0010:dsmark_init net/sched/sch_dsmark.c:364 [inline] RIP: 0010:dsmark_init+0x193/0x640 net/sched/sch_dsmark.c:339 Code: 85 db 58 0f 88 7d 03 00 00 e8 e9 1a ac fb 48 8b 9d 70 ff ff ff 48 b8 00 00 00 00 00 fc ff df 48 8d 7b 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85 ca RSP: 0018:ffff88809426f3b8 EFLAGS: 00010247 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff85c6eb09 RDX: 0000000000000000 RSI: ffffffff85c6eb17 RDI: 0000000000000004 RBP: ffff88809426f4b0 R08: ffff88808c4085c0 R09: ffffed1015d26159 R10: ffffed1015d26158 R11: ffff8880ae930ac7 R12: ffff8880a7e96940 R13: dffffc0000000000 R14: ffff88809426f8c0 R15: 0000000000000000 FS: 0000000001292880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000080 CR3: 000000008ca1b000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: qdisc_create+0x4ee/0x1210 net/sched/sch_api.c:1237 tc_modify_qdisc+0x524/0x1c50 net/sched/sch_api.c:1653 rtnetlink_rcv_msg+0x463/0xb00 net/core/rtnetlink.c:5223 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x803/0x920 net/socket.c:2311 __sys_sendmsg+0x105/0x1d0 net/socket.c:2356 __do_sys_sendmsg net/socket.c:2365 [inline] __se_sys_sendmsg net/socket.c:2363 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440369 Fixes: 758cc43c6d73 ("[PKT_SCHED]: Fix dsmark to apply changes consistent") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index bad1cbe59a56..05605b30bef3 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -361,6 +361,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, goto errout; err = -EINVAL; + if (!tb[TCA_DSMARK_INDICES]) + goto errout; indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); if (hweight32(indices) != 1) From a0c2dc1fe63e2869b74c1c7f6a81d1745c8a695d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2019 11:08:34 -0700 Subject: [PATCH 297/312] nfc: fix memory leak in llcp_sock_bind() sysbot reported a memory leak after a bind() has failed. While we are at it, abort the operation if kmemdup() has failed. BUG: memory leak unreferenced object 0xffff888105d83ec0 (size 32): comm "syz-executor067", pid 7207, jiffies 4294956228 (age 19.430s) hex dump (first 32 bytes): 00 69 6c 65 20 72 65 61 64 00 6e 65 74 3a 5b 34 .ile read.net:[4 30 32 36 35 33 33 30 39 37 5d 00 00 00 00 00 00 026533097]...... backtrace: [<0000000036bac473>] kmemleak_alloc_recursive /./include/linux/kmemleak.h:43 [inline] [<0000000036bac473>] slab_post_alloc_hook /mm/slab.h:522 [inline] [<0000000036bac473>] slab_alloc /mm/slab.c:3319 [inline] [<0000000036bac473>] __do_kmalloc /mm/slab.c:3653 [inline] [<0000000036bac473>] __kmalloc_track_caller+0x169/0x2d0 /mm/slab.c:3670 [<000000000cd39d07>] kmemdup+0x27/0x60 /mm/util.c:120 [<000000008e57e5fc>] kmemdup /./include/linux/string.h:432 [inline] [<000000008e57e5fc>] llcp_sock_bind+0x1b3/0x230 /net/nfc/llcp_sock.c:107 [<000000009cb0b5d3>] __sys_bind+0x11c/0x140 /net/socket.c:1647 [<00000000492c3bbc>] __do_sys_bind /net/socket.c:1658 [inline] [<00000000492c3bbc>] __se_sys_bind /net/socket.c:1656 [inline] [<00000000492c3bbc>] __x64_sys_bind+0x1e/0x30 /net/socket.c:1656 [<0000000008704b2a>] do_syscall_64+0x76/0x1a0 /arch/x86/entry/common.c:296 [<000000009f4c57a4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 30cc4587659e ("NFC: Move LLCP code to the NFC top level diirectory") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 8dfea26536c9..ccdd790e163a 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -107,9 +107,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); - + if (!llcp_sock->service_name) { + ret = -ENOMEM; + goto put_dev; + } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; ret = -EADDRINUSE; goto put_dev; } From 1acb8f2a7a9f10543868ddd737e37424d5c36cf4 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Fri, 4 Oct 2019 15:24:39 -0500 Subject: [PATCH 298/312] net: qlogic: Fix memory leak in ql_alloc_large_buffers In ql_alloc_large_buffers, a new skb is allocated via netdev_alloc_skb. This skb should be released if pci_dma_mapping_error fails. Fixes: 0f8ab89e825f ("qla3xxx: Check return code from pci_map_single() in ql_release_to_lrg_buf_free_list(), ql_populate_free_queue(), ql_alloc_large_buffers(), and ql3xxx_send()") Signed-off-by: Navid Emamdoost Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qla3xxx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 457444894d80..b4b8ba00ee01 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -2787,6 +2787,7 @@ static int ql_alloc_large_buffers(struct ql3_adapter *qdev) netdev_err(qdev->ndev, "PCI mapping failed with error: %d\n", err); + dev_kfree_skb_irq(skb); ql_free_large_buffers(qdev); return -ENOMEM; } From ef129d34149ea23d0d442844fc25ae26a85589fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 17:36:50 -0700 Subject: [PATCH 299/312] selftests/net: add nettest to .gitignore nettest is missing from gitignore. Fixes: acda655fefae ("selftests: Add nettest") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/net/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index c7cced739c34..8aefd81fbc86 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -21,3 +21,4 @@ ipv6_flowlabel ipv6_flowlabel_mgr so_txtime tcp_fastopen_backup_key +nettest From 6b190d3ce0a693750517256ba67011a9fa04a12a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 26 Jul 2019 11:10:54 +0900 Subject: [PATCH 300/312] integrity: remove unneeded, broken attempt to add -fshort-wchar I guess commit 15ea0e1e3e18 ("efi: Import certificates from UEFI Secure Boot") attempted to add -fshort-wchar for building load_uefi.o, but it has never worked as intended. load_uefi.o is created in the platform_certs/ sub-directory. If you really want to add -fshort-wchar, the correct code is: $(obj)/platform_certs/load_uefi.o: KBUILD_CFLAGS += -fshort-wchar But, you do not need to fix it. Commit 8c97023cf051 ("Kbuild: use -fshort-wchar globally") had already added -fshort-wchar globally. This code was unneeded in the first place. Signed-off-by: Masahiro Yamada --- security/integrity/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/security/integrity/Makefile b/security/integrity/Makefile index 19faace69644..b6d6273a4176 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -13,7 +13,6 @@ integrity-$(CONFIG_INTEGRITY_PLATFORM_KEYRING) += platform_certs/platform_keyrin integrity-$(CONFIG_LOAD_UEFI_KEYS) += platform_certs/efi_parser.o \ platform_certs/load_uefi.o integrity-$(CONFIG_LOAD_IPL_KEYS) += platform_certs/load_ipl_s390.o -$(obj)/load_uefi.o: KBUILD_CFLAGS += -fshort-wchar subdir-$(CONFIG_IMA) += ima obj-$(CONFIG_IMA) += ima/ From 7a8beb7ad51551babd17331f3c0f58af25916f67 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 26 Jul 2019 11:10:55 +0900 Subject: [PATCH 301/312] integrity: remove pointless subdir-$(CONFIG_...) The ima/ and evm/ sub-directories contain built-in objects, so obj-$(CONFIG_...) is the correct way to descend into them. subdir-$(CONFIG_...) is redundant. Signed-off-by: Masahiro Yamada --- security/integrity/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/security/integrity/Makefile b/security/integrity/Makefile index b6d6273a4176..35e6ca773734 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -14,7 +14,5 @@ integrity-$(CONFIG_LOAD_UEFI_KEYS) += platform_certs/efi_parser.o \ platform_certs/load_uefi.o integrity-$(CONFIG_LOAD_IPL_KEYS) += platform_certs/load_ipl_s390.o -subdir-$(CONFIG_IMA) += ima obj-$(CONFIG_IMA) += ima/ -subdir-$(CONFIG_EVM) += evm obj-$(CONFIG_EVM) += evm/ From a9bbe79fd5cb2411d69ab8a2e94bb701ca45e133 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 21 Aug 2019 13:12:34 +0900 Subject: [PATCH 302/312] video/logo: remove unneeded *.o pattern from clean-files The pattern *.o is cleaned up globally by the top Makefile. Signed-off-by: Masahiro Yamada --- drivers/video/logo/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/logo/Makefile b/drivers/video/logo/Makefile index 228a89b9bdd1..10b75ce3ce09 100644 --- a/drivers/video/logo/Makefile +++ b/drivers/video/logo/Makefile @@ -56,4 +56,4 @@ $(obj)/%_gray256.c: $(src)/%_gray256.pgm $(pnmtologo) FORCE $(call if_changed,logo) # Files generated that shall be removed upon make clean -clean-files := *.o *_mono.c *_vga16.c *_clut224.c *_gray256.c +clean-files := *_mono.c *_vga16.c *_clut224.c *_gray256.c From 01bb25156d856e3efb887a273c03f8ba66f8fb0f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 21 Aug 2019 13:12:35 +0900 Subject: [PATCH 303/312] video/logo: do not generate unneeded logo C files Currently, all the logo C files are generated irrespective of the CONFIG options. Adding them to extra-y is wrong. What we need to do here is to add them to 'targets' so that if_changed works properly. Files listed in 'targets' are cleaned, so clean-files is unneeded. Signed-off-by: Masahiro Yamada --- drivers/video/logo/Makefile | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/video/logo/Makefile b/drivers/video/logo/Makefile index 10b75ce3ce09..16f60c1e1766 100644 --- a/drivers/video/logo/Makefile +++ b/drivers/video/logo/Makefile @@ -18,23 +18,6 @@ obj-$(CONFIG_SPU_BASE) += logo_spe_clut224.o # How to generate logo's -# Use logo-cfiles to retrieve list of .c files to be built -logo-cfiles = $(notdir $(patsubst %.$(2), %.c, \ - $(wildcard $(srctree)/$(src)/*$(1).$(2)))) - - -# Mono logos -extra-y += $(call logo-cfiles,_mono,pbm) - -# VGA16 logos -extra-y += $(call logo-cfiles,_vga16,ppm) - -# 224 Logos -extra-y += $(call logo-cfiles,_clut224,ppm) - -# Gray 256 -extra-y += $(call logo-cfiles,_gray256,pgm) - pnmtologo := scripts/pnmtologo # Create commands like "pnmtologo -t mono -n logo_mac_mono -o ..." @@ -55,5 +38,5 @@ $(obj)/%_clut224.c: $(src)/%_clut224.ppm $(pnmtologo) FORCE $(obj)/%_gray256.c: $(src)/%_gray256.pgm $(pnmtologo) FORCE $(call if_changed,logo) -# Files generated that shall be removed upon make clean -clean-files := *_mono.c *_vga16.c *_clut224.c *_gray256.c +# generated C files +targets += *_mono.c *_vga16.c *_clut224.c *_gray256.c From 82fdd12b95727640c9a8233c09d602e4518e71f7 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 27 Sep 2019 16:30:27 -0700 Subject: [PATCH 304/312] namespace: fix namespace.pl script to support relative paths The namespace.pl script does not work properly if objtree is not set to an absolute path. The do_nm function is run from within the find function, which changes directories. Because of this, appending objtree, $File::Find::dir, and $source, will return a path which is not valid from the current directory. This used to work when objtree was set to an absolute path when using "make namespacecheck". It appears to have not worked when calling ./scripts/namespace.pl directly. This behavior was changed in 7e1c04779efd ("kbuild: Use relative path for $(objtree)", 2014-05-14) Rather than fixing the Makefile to set objtree to an absolute path, just fix namespace.pl to work when srctree and objtree are relative. Also fix the script to use an absolute path for these by default. Use the File::Spec module for this purpose. It's been part of perl 5 since 5.005. The curdir() function is used to get the current directory when the objtree and srctree aren't set in the environment. rel2abs() is used to convert possibly relative objtree and srctree environment variables to absolute paths. Finally, the catfile() function is used instead of string appending paths together, since this is more robust when joining paths together. Signed-off-by: Jacob Keller Acked-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- scripts/namespace.pl | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/namespace.pl b/scripts/namespace.pl index 6135574a6f39..1da7bca201a4 100755 --- a/scripts/namespace.pl +++ b/scripts/namespace.pl @@ -65,13 +65,14 @@ use warnings; use strict; use File::Find; +use File::Spec; my $nm = ($ENV{'NM'} || "nm") . " -p"; my $objdump = ($ENV{'OBJDUMP'} || "objdump") . " -s -j .comment"; -my $srctree = ""; -my $objtree = ""; -$srctree = "$ENV{'srctree'}/" if (exists($ENV{'srctree'})); -$objtree = "$ENV{'objtree'}/" if (exists($ENV{'objtree'})); +my $srctree = File::Spec->curdir(); +my $objtree = File::Spec->curdir(); +$srctree = File::Spec->rel2abs($ENV{'srctree'}) if (exists($ENV{'srctree'})); +$objtree = File::Spec->rel2abs($ENV{'objtree'}) if (exists($ENV{'objtree'})); if ($#ARGV != -1) { print STDERR "usage: $0 takes no parameters\n"; @@ -231,9 +232,9 @@ sub do_nm } ($source = $basename) =~ s/\.o$//; if (-e "$source.c" || -e "$source.S") { - $source = "$objtree$File::Find::dir/$source"; + $source = File::Spec->catfile($objtree, $File::Find::dir, $source) } else { - $source = "$srctree$File::Find::dir/$source"; + $source = File::Spec->catfile($srctree, $File::Find::dir, $source) } if (! -e "$source.c" && ! -e "$source.S") { # No obvious source, exclude the object if it is conglomerate From 7a82e3fa28f174ba23c9faca544c65986e3025f1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 1 Oct 2019 21:17:24 +0900 Subject: [PATCH 305/312] scripts/setlocalversion: clear local variable to make it work for sh Geert Uytterhoeven reports a strange side-effect of commit 858805b336be ("kbuild: add $(BASH) to run scripts with bash-extension"), which inserts the contents of a localversion file in the build directory twice. [Steps to Reproduce] $ echo bar > localversion $ mkdir build $ cd build/ $ echo foo > localversion $ make -s -f ../Makefile defconfig include/config/kernel.release $ cat include/config/kernel.release 5.4.0-rc1foofoobar This comes down to the behavior change of local variables. The 'man sh' on my Ubuntu machine, where sh is an alias to dash, explains as follows: When a variable is made local, it inherits the initial value and exported and readonly flags from the variable with the same name in the surrounding scope, if there is one. Otherwise, the variable is initially unset. [Test Code] foo () { local res echo "res: $res" } res=1 foo [Result] $ sh test.sh res: 1 $ bash test.sh res: So, scripts/setlocalversion correctly works only for bash in spite of its hashbang being #!/bin/sh. Nobody had noticed it before because CONFIG_SHELL was previously set to bash almost all the time. Now that CONFIG_SHELL is set to sh, we must write portable and correct code. I gave the Fixes tag to the commit that uncovered the issue. Clear the variable 'res' in collect_files() to make it work for sh (and it also works on distributions where sh is an alias to bash). Fixes: 858805b336be ("kbuild: add $(BASH) to run scripts with bash-extension") Reported-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada Tested-by: Geert Uytterhoeven --- scripts/setlocalversion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/setlocalversion b/scripts/setlocalversion index 365b3c2b8f43..220dae0db3f1 100755 --- a/scripts/setlocalversion +++ b/scripts/setlocalversion @@ -126,7 +126,7 @@ scm_version() collect_files() { - local file res + local file res= for file; do case "$file" in From 43496709f166aeb87ac0bdcc6f7bb2bedafb17c9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 19:29:12 +0900 Subject: [PATCH 306/312] kbuild: two minor updates for Documentation/kbuild/modules.rst Capitalize the first word in the sentence. Use obj-m instead of obj-y. obj-y still works, but we have no built-in objects in external module builds. So, obj-m is better IMHO. Signed-off-by: Masahiro Yamada --- Documentation/kbuild/modules.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/kbuild/modules.rst b/Documentation/kbuild/modules.rst index 33a17121d11d..774a998dcf37 100644 --- a/Documentation/kbuild/modules.rst +++ b/Documentation/kbuild/modules.rst @@ -502,7 +502,7 @@ build. ---------------------------------------- Sometimes, an external module uses exported symbols from - another external module. kbuild needs to have full knowledge of + another external module. Kbuild needs to have full knowledge of all symbols to avoid spitting out warnings about undefined symbols. Three solutions exist for this situation. @@ -522,7 +522,7 @@ build. The top-level kbuild file would then look like:: #./Kbuild (or ./Makefile): - obj-y := foo/ bar/ + obj-m := foo/ bar/ And executing:: From d188b8c901bb4966ab035ce90af02426f062e9e1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 11:36:29 +0900 Subject: [PATCH 307/312] kbuild: update compile-test header list for v5.4-rc2 Commit 6dc280ebeed2 ("coda: remove uapi/linux/coda_psdev.h") removed a header in question. Some more build errors were fixed. Add more headers into the test coverage. Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/usr/include/Makefile b/usr/include/Makefile index c9449aaf438d..57b20f7b6729 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -29,13 +29,11 @@ header-test- += linux/android/binderfs.h header-test-$(CONFIG_CPU_BIG_ENDIAN) += linux/byteorder/big_endian.h header-test-$(CONFIG_CPU_LITTLE_ENDIAN) += linux/byteorder/little_endian.h header-test- += linux/coda.h -header-test- += linux/coda_psdev.h header-test- += linux/elfcore.h header-test- += linux/errqueue.h header-test- += linux/fsmap.h header-test- += linux/hdlc/ioctl.h header-test- += linux/ivtv.h -header-test- += linux/jffs2.h header-test- += linux/kexec.h header-test- += linux/matroxfb.h header-test- += linux/netfilter_ipv4/ipt_LOG.h @@ -55,20 +53,12 @@ header-test- += linux/v4l2-mediabus.h header-test- += linux/v4l2-subdev.h header-test- += linux/videodev2.h header-test- += linux/vm_sockets.h -header-test- += scsi/scsi_bsg_fc.h -header-test- += scsi/scsi_netlink.h -header-test- += scsi/scsi_netlink_fc.h header-test- += sound/asequencer.h header-test- += sound/asoc.h header-test- += sound/asound.h header-test- += sound/compress_offload.h header-test- += sound/emu10k1.h header-test- += sound/sfnt_info.h -header-test- += sound/sof/eq.h -header-test- += sound/sof/fw.h -header-test- += sound/sof/header.h -header-test- += sound/sof/manifest.h -header-test- += sound/sof/trace.h header-test- += xen/evtchn.h header-test- += xen/gntdev.h header-test- += xen/privcmd.h From 86cdd2fdc4e39c388d39c7ba2396d1a9dfd66226 Mon Sep 17 00:00:00 2001 From: Dmitry Goldin Date: Fri, 4 Oct 2019 10:40:07 +0000 Subject: [PATCH 308/312] kheaders: make headers archive reproducible In commit 43d8ce9d65a5 ("Provide in-kernel headers to make extending kernel easier") a new mechanism was introduced, for kernels >=5.2, which embeds the kernel headers in the kernel image or a module and exposes them in procfs for use by userland tools. The archive containing the header files has nondeterminism caused by header files metadata. This patch normalizes the metadata and utilizes KBUILD_BUILD_TIMESTAMP if provided and otherwise falls back to the default behaviour. In commit f7b101d33046 ("kheaders: Move from proc to sysfs") it was modified to use sysfs and the script for generation of the archive was renamed to what is being patched. Signed-off-by: Dmitry Goldin Reviewed-by: Greg Kroah-Hartman Reviewed-by: Joel Fernandes (Google) Signed-off-by: Masahiro Yamada --- Documentation/kbuild/reproducible-builds.rst | 13 +++++++++---- kernel/gen_kheaders.sh | 5 ++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Documentation/kbuild/reproducible-builds.rst b/Documentation/kbuild/reproducible-builds.rst index ab92e98c89c8..503393854e2e 100644 --- a/Documentation/kbuild/reproducible-builds.rst +++ b/Documentation/kbuild/reproducible-builds.rst @@ -16,16 +16,21 @@ the kernel may be unreproducible, and how to avoid them. Timestamps ---------- -The kernel embeds a timestamp in two places: +The kernel embeds timestamps in three places: * The version string exposed by ``uname()`` and included in ``/proc/version`` * File timestamps in the embedded initramfs -By default the timestamp is the current time. This must be overridden -using the `KBUILD_BUILD_TIMESTAMP`_ variable. If you are building -from a git commit, you could use its commit date. +* If enabled via ``CONFIG_IKHEADERS``, file timestamps of kernel + headers embedded in the kernel or respective module, + exposed via ``/sys/kernel/kheaders.tar.xz`` + +By default the timestamp is the current time and in the case of +``kheaders`` the various files' modification times. This must +be overridden using the `KBUILD_BUILD_TIMESTAMP`_ variable. +If you are building from a git commit, you could use its commit date. The kernel does *not* use the ``__DATE__`` and ``__TIME__`` macros, and enables warnings if they are used. If you incorporate external diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 9ff449888d9c..aff79e461fc9 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,7 +71,10 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null +# Create archive and try to normalize metadata for reproducibility +tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --sort=name --numeric-owner \ + -Jcf $tarfile -C $cpio_dir/ . > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 From b1c41ac3ce569b04644bb1e3fd28926604637da3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Oct 2019 14:24:47 +0200 Subject: [PATCH 309/312] KVM: s390: fix __insn32_query() inline assembly The inline assembly constraints of __insn32_query() tell the compiler that only the first byte of "query" is being written to. Intended was probably that 32 bytes are written to. Fix and simplify the code and just use a "memory" clobber. Fixes: d668139718a9 ("KVM: s390: provide query function for instructions returning 32 byte") Cc: stable@vger.kernel.org # v5.2+ Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kvm/kvm-s390.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f6db0f1bc867..74e1a22786e8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -332,7 +332,7 @@ static inline int plo_test_bit(unsigned char nr) return cc == 0; } -static inline void __insn32_query(unsigned int opcode, u8 query[32]) +static inline void __insn32_query(unsigned int opcode, u8 *query) { register unsigned long r0 asm("0") = 0; /* query function */ register unsigned long r1 asm("1") = (unsigned long) query; @@ -340,9 +340,9 @@ static inline void __insn32_query(unsigned int opcode, u8 query[32]) asm volatile( /* Parameter regs are ignored */ " .insn rrf,%[opc] << 16,2,4,6,0\n" - : "=m" (*query) + : : "d" (r0), "a" (r1), [opc] "i" (opcode) - : "cc"); + : "cc", "memory"); } #define INSN_SORTL 0xb938 From d0dea733f60efe94257d08ae6eba81d0b511d0a9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Oct 2019 14:34:37 +0200 Subject: [PATCH 310/312] KVM: s390: mark __insn32_query() as __always_inline __insn32_query() will not compile if the compiler decides to not inline it, since it contains an inline assembly with an "i" constraint with variable contents. Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 74e1a22786e8..d047e846e1b9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -332,7 +332,7 @@ static inline int plo_test_bit(unsigned char nr) return cc == 0; } -static inline void __insn32_query(unsigned int opcode, u8 *query) +static __always_inline void __insn32_query(unsigned int opcode, u8 *query) { register unsigned long r0 asm("0") = 0; /* query function */ register unsigned long r1 asm("1") = (unsigned long) query; From 9f79b78ef74436c7507bac6bfb7b8b989263bccb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 May 2016 21:59:07 -0700 Subject: [PATCH 311/312] Convert filldir[64]() from __put_user() to unsafe_put_user() We really should avoid the "__{get,put}_user()" functions entirely, because they can easily be mis-used and the original intent of being used for simple direct user accesses no longer holds in a post-SMAP/PAN world. Manually optimizing away the user access range check makes no sense any more, when the range check is generally much cheaper than the "enable user accesses" code that the __{get,put}_user() functions still need. So instead of __put_user(), use the unsafe_put_user() interface with user_access_{begin,end}() that really does generate better code these days, and which is generally a nicer interface. Under some loads, the multiple user writes that filldir() does are actually quite noticeable. This also makes the dirent name copy use unsafe_put_user() with a couple of macros. We do not want to make function calls with SMAP/PAN disabled, and the code this generates is quite good when the architecture uses "asm goto" for unsafe_put_user() like x86 does. Note that this doesn't bother with the legacy cases. Nobody should use them anyway, so performance doesn't really matter there. Signed-off-by: Linus Torvalds --- fs/readdir.c | 128 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 93 insertions(+), 35 deletions(-) diff --git a/fs/readdir.c b/fs/readdir.c index 2f6a4534e0df..579c8ea894ae 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -20,9 +20,63 @@ #include #include #include - #include +#include + +/* + * Note the "unsafe_put_user() semantics: we goto a + * label for errors. + * + * Also note how we use a "while()" loop here, even though + * only the biggest size needs to loop. The compiler (well, + * at least gcc) is smart enough to turn the smaller sizes + * into just if-statements, and this way we don't need to + * care whether 'u64' or 'u32' is the biggest size. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(get_unaligned((type *)src), \ + (type __user *)dst, label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +/* + * We avoid doing 64-bit copies on 32-bit architectures. They + * might be better, but the component names are mostly small, + * and the 64-bit cases can end up being much more complex and + * put much more register pressure on the code, so it's likely + * not worth the pain of unaligned accesses etc. + * + * So limit the copies to "unsigned long" size. I did verify + * that at least the x86-32 case is ok without this limiting, + * but I worry about random other legacy 32-bit cases that + * might not do as well. + */ +#define unsafe_copy_type(dst, src, len, type, label) do { \ + if (sizeof(type) <= sizeof(unsigned long)) \ + unsafe_copy_loop(dst, src, len, type, label); \ +} while (0) + +/* + * Copy the dirent name to user space, and NUL-terminate + * it. This should not be a function call, since we're doing + * the copy inside a "user_access_begin/end()" section. + */ +#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ + char __user *dst = (_dst); \ + const char *src = (_src); \ + size_t len = (_len); \ + unsafe_copy_type(dst, src, len, u64, label); \ + unsafe_copy_type(dst, src, len, u32, label); \ + unsafe_copy_type(dst, src, len, u16, label); \ + unsafe_copy_type(dst, src, len, u8, label); \ + unsafe_put_user(0, dst, label); \ +} while (0) + + int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); @@ -182,28 +236,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EOVERFLOW; } dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) + goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -263,30 +320,31 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, if (reclen > buf->count) return -EINVAL; dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) + goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); dirent = buf->current_dir; - if (__put_user(ino, &dirent->d_ino)) - goto efault; - if (__put_user(0, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, &dirent->d_type, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; From 8a23eb804ca4f2be909e372cf5a9e7b30ae476cd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Oct 2019 11:32:52 -0700 Subject: [PATCH 312/312] Make filldir[64]() verify the directory entry filename is valid This has been discussed several times, and now filesystem people are talking about doing it individually at the filesystem layer, so head that off at the pass and just do it in getdents{64}(). This is partially based on a patch by Jann Horn, but checks for NUL bytes as well, and somewhat simplified. There's also commentary about how it might be better if invalid names due to filesystem corruption don't cause an immediate failure, but only an error at the end of the readdir(), so that people can still see the filenames that are ok. There's also been discussion about just how much POSIX strictly speaking requires this since it's about filesystem corruption. It's really more "protect user space from bad behavior" as pointed out by Jann. But since Eric Biederman looked up the POSIX wording, here it is for context: "From readdir: The readdir() function shall return a pointer to a structure representing the directory entry at the current position in the directory stream specified by the argument dirp, and position the directory stream at the next entry. It shall return a null pointer upon reaching the end of the directory stream. The structure dirent defined in the header describes a directory entry. From definitions: 3.129 Directory Entry (or Link) An object that associates a filename with a file. Several directory entries can associate names with the same file. ... 3.169 Filename A name consisting of 1 to {NAME_MAX} bytes used to name a file. The characters composing the name may be selected from the set of all character values excluding the slash character and the null byte. The filenames dot and dot-dot have special meaning. A filename is sometimes referred to as a 'pathname component'." Note that I didn't bother adding the checks to any legacy interfaces that nobody uses. Also note that if this ends up being noticeable as a performance regression, we can fix that to do a much more optimized model that checks for both NUL and '/' at the same time one word at a time. We haven't really tended to optimize 'memchr()', and it only checks for one pattern at a time anyway, and we really _should_ check for NUL too (but see the comment about "soft errors" in the code about why it currently only checks for '/') See the CONFIG_DCACHE_WORD_ACCESS case of hash_name() for how the name lookup code looks for pathname terminating characters in parallel. Link: https://lore.kernel.org/lkml/20190118161440.220134-2-jannh@google.com/ Cc: Alexander Viro Cc: Jann Horn Cc: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/readdir.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/readdir.c b/fs/readdir.c index 579c8ea894ae..19bea591c3f1 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -118,6 +118,40 @@ out: } EXPORT_SYMBOL(iterate_dir); +/* + * POSIX says that a dirent name cannot contain NULL or a '/'. + * + * It's not 100% clear what we should really do in this case. + * The filesystem is clearly corrupted, but returning a hard + * error means that you now don't see any of the other names + * either, so that isn't a perfect alternative. + * + * And if you return an error, what error do you use? Several + * filesystems seem to have decided on EUCLEAN being the error + * code for EFSCORRUPTED, and that may be the error to use. Or + * just EIO, which is perhaps more obvious to users. + * + * In order to see the other file names in the directory, the + * caller might want to make this a "soft" error: skip the + * entry, and return the error at the end instead. + * + * Note that this should likely do a "memchr(name, 0, len)" + * check too, since that would be filesystem corruption as + * well. However, that case can't actually confuse user space, + * which has to do a strlen() on the name anyway to find the + * filename length, and the above "soft error" worry means + * that it's probably better left alone until we have that + * issue clarified. + */ +static int verify_dirent_name(const char *name, int len) +{ + if (WARN_ON_ONCE(!len)) + return -EIO; + if (WARN_ON_ONCE(memchr(name, '/', len))) + return -EIO; + return 0; +} + /* * Traditional linux readdir() handling.. * @@ -227,6 +261,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -316,6 +353,9 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL;