From 1aea9b3f921003f0880f0676ae85d87c9f1cb4a2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 24 Apr 2017 18:19:10 +0200 Subject: [PATCH 01/52] s390/mm: implement 5 level pages tables Add the logic to upgrade the page table for a 64-bit process to five levels. This increases the TASK_SIZE from 8PB to 16EB-4K. Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 2 +- arch/s390/include/asm/page.h | 3 ++ arch/s390/include/asm/pgalloc.h | 25 ++++++++-- arch/s390/include/asm/pgtable.h | 80 ++++++++++++++++++++++++++----- arch/s390/include/asm/processor.h | 4 +- arch/s390/include/asm/tlb.h | 15 ++++++ arch/s390/mm/dump_pagetables.c | 23 +++++++-- arch/s390/mm/gmap.c | 5 +- arch/s390/mm/gup.c | 33 +++++++++++-- arch/s390/mm/hugetlbpage.c | 30 +++++++----- arch/s390/mm/mmap.c | 4 +- arch/s390/mm/pageattr.c | 30 ++++++++++-- arch/s390/mm/pgalloc.c | 57 ++++++++++++++-------- arch/s390/mm/pgtable.c | 6 ++- arch/s390/mm/vmem.c | 44 +++++++++++++++-- 15 files changed, 289 insertions(+), 72 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e161fafb495b..e7ff58150e8f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -184,7 +184,7 @@ config SCHED_OMIT_FRAME_POINTER config PGTABLE_LEVELS int - default 4 + default 5 source "init/Kconfig" diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 69b8a41fca84..624deaa44230 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -74,6 +74,7 @@ typedef struct { unsigned long pgste; } pgste_t; typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pud; } pud_t; +typedef struct { unsigned long p4d; } p4d_t; typedef struct { unsigned long pgd; } pgd_t; typedef pte_t *pgtable_t; @@ -82,12 +83,14 @@ typedef pte_t *pgtable_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) #define pud_val(x) ((x).pud) +#define p4d_val(x) ((x).p4d) #define pgd_val(x) ((x).pgd) #define __pgste(x) ((pgste_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) #define __pmd(x) ((pmd_t) { (x) } ) #define __pud(x) ((pud_t) { (x) } ) +#define __p4d(x) ((p4d_t) { (x) } ) #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 166f703dad7c..bb0ff1bb0c4a 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -51,12 +51,24 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm) return _SEGMENT_ENTRY_EMPTY; if (mm->context.asce_limit <= (1UL << 42)) return _REGION3_ENTRY_EMPTY; - return _REGION2_ENTRY_EMPTY; + if (mm->context.asce_limit <= (1UL << 53)) + return _REGION2_ENTRY_EMPTY; + return _REGION1_ENTRY_EMPTY; } -int crst_table_upgrade(struct mm_struct *); +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit); void crst_table_downgrade(struct mm_struct *); +static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) +{ + unsigned long *table = crst_table_alloc(mm); + + if (table) + crst_table_init(table, _REGION2_ENTRY_EMPTY); + return (p4d_t *) table; +} +#define p4d_free(mm, p4d) crst_table_free(mm, (unsigned long *) p4d) + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { unsigned long *table = crst_table_alloc(mm); @@ -86,9 +98,14 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) crst_table_free(mm, (unsigned long *) pmd); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { - pgd_val(*pgd) = _REGION2_ENTRY | __pa(pud); + pgd_val(*pgd) = _REGION1_ENTRY | __pa(p4d); +} + +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + p4d_val(*p4d) = _REGION2_ENTRY | __pa(pud); } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index e6e3b887bee3..3effb26f0e1a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -24,7 +24,6 @@ * the S390 page table tree. */ #ifndef __ASSEMBLY__ -#include #include #include #include @@ -87,12 +86,15 @@ extern unsigned long zero_page_mask; */ #define PMD_SHIFT 20 #define PUD_SHIFT 31 -#define PGDIR_SHIFT 42 +#define P4D_SHIFT 42 +#define PGDIR_SHIFT 53 #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PUD_SIZE (1UL << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) +#define P4D_SIZE (1UL << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE-1)) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -105,6 +107,7 @@ extern unsigned long zero_page_mask; #define PTRS_PER_PTE 256 #define PTRS_PER_PMD 2048 #define PTRS_PER_PUD 2048 +#define PTRS_PER_P4D 2048 #define PTRS_PER_PGD 2048 #define FIRST_USER_ADDRESS 0UL @@ -115,6 +118,8 @@ extern unsigned long zero_page_mask; printk("%s:%d: bad pmd %p.\n", __FILE__, __LINE__, (void *) pmd_val(e)) #define pud_ERROR(e) \ printk("%s:%d: bad pud %p.\n", __FILE__, __LINE__, (void *) pud_val(e)) +#define p4d_ERROR(e) \ + printk("%s:%d: bad p4d %p.\n", __FILE__, __LINE__, (void *) p4d_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) @@ -310,8 +315,8 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ #endif -#define _REGION_ENTRY_BITS 0xfffffffffffff227UL -#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe27UL +#define _REGION_ENTRY_BITS 0xfffffffffffff22fUL +#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL /* Bits in the segment table entry */ #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL @@ -564,14 +569,14 @@ static inline void crdte(unsigned long old, unsigned long new, */ static inline int pgd_present(pgd_t pgd) { - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) return 1; return (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) != 0UL; } static inline int pgd_none(pgd_t pgd) { - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) return 0; return (pgd_val(pgd) & _REGION_ENTRY_INVALID) != 0UL; } @@ -589,6 +594,28 @@ static inline int pgd_bad(pgd_t pgd) return (pgd_val(pgd) & mask) != 0; } +static inline int p4d_present(p4d_t p4d) +{ + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + return 1; + return (p4d_val(p4d) & _REGION_ENTRY_ORIGIN) != 0UL; +} + +static inline int p4d_none(p4d_t p4d) +{ + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + return 0; + return p4d_val(p4d) == _REGION2_ENTRY_EMPTY; +} + +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + return (p4d_val(p4d) & origin_mask) >> PAGE_SHIFT; +} + static inline int pud_present(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) @@ -641,6 +668,13 @@ static inline int pud_bad(pud_t pud) return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0; } +static inline int p4d_bad(p4d_t p4d) +{ + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + return pud_bad(__pud(p4d_val(p4d))); + return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; +} + static inline int pmd_present(pmd_t pmd) { return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY; @@ -794,8 +828,14 @@ static inline int pte_unused(pte_t pte) static inline void pgd_clear(pgd_t *pgd) { - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pgd_val(*pgd) = _REGION2_ENTRY_EMPTY; + if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) + pgd_val(*pgd) = _REGION1_ENTRY_EMPTY; +} + +static inline void p4d_clear(p4d_t *p4d) +{ + if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) + p4d_val(*p4d) = _REGION2_ENTRY_EMPTY; } static inline void pud_clear(pud_t *pud) @@ -1089,6 +1129,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) } #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) +#define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) @@ -1098,19 +1139,31 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) #define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) +#define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) #define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - pud_t *pud = (pud_t *) pgd; - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pud = (pud_t *) pgd_deref(*pgd); - return pud + pud_index(address); + p4d_t *p4d = (p4d_t *) pgd; + + if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) + p4d = (p4d_t *) pgd_deref(*pgd); + return p4d + p4d_index(address); +} + +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + pud_t *pud = (pud_t *) p4d; + + if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) + pud = (pud_t *) p4d_deref(*p4d); + return pud + pud_index(address); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { pmd_t *pmd = (pmd_t *) pud; + if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) pmd = (pmd_t *) pud_deref(*pud); return pmd + pmd_index(address); @@ -1122,6 +1175,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) #define pud_page(pud) pfn_to_page(pud_pfn(pud)) +#define p4d_page(pud) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the lowest level page table.. */ #define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 60d395fdc864..f57c017a5c03 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -92,11 +92,11 @@ extern void execve_tail(void); */ #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_31BIT) ? \ - (1UL << 31) : (1UL << 53)) + (1UL << 31) : -PAGE_SIZE) #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ (1UL << 30) : (1UL << 41)) #define TASK_SIZE TASK_SIZE_OF(current) -#define TASK_SIZE_MAX (1UL << 53) +#define TASK_SIZE_MAX (-PAGE_SIZE) #define STACK_TOP (test_thread_flag(TIF_31BIT) ? \ (1UL << 31) : (1UL << 42)) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 853b2a3d8dee..7317b3108a88 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -136,6 +136,21 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, tlb_remove_table(tlb, pmd); } +/* + * p4d_free_tlb frees a pud table and clears the CRSTE for the + * region second table entry from the tlb. + * If the mm uses a four level page table the single p4d is freed + * as the pgd. p4d_free_tlb checks the asce_limit against 8PB + * to avoid the double free of the p4d in this case. + */ +static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long address) +{ + if (tlb->mm->context.asce_limit <= (1UL << 53)) + return; + tlb_remove_table(tlb, p4d); +} + /* * pud_free_tlb frees a pud table and clears the CRSTE for the * region third table entry from the tlb. diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 1b553d847140..049c3c455b32 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -149,7 +149,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, } static void walk_pud_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) + p4d_t *p4d, unsigned long addr) { unsigned int prot; pud_t *pud; @@ -157,7 +157,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { st->current_address = addr; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); if (!pud_none(*pud)) if (pud_large(*pud)) { prot = pud_val(*pud) & @@ -172,6 +172,23 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, } } +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, + pgd_t *pgd, unsigned long addr) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++) { + st->current_address = addr; + p4d = p4d_offset(pgd, addr); + if (!p4d_none(*p4d)) + walk_pud_level(m, st, p4d, addr); + else + note_page(m, st, _PAGE_INVALID, 2); + addr += P4D_SIZE; + } +} + static void walk_pgd_level(struct seq_file *m) { unsigned long addr = 0; @@ -184,7 +201,7 @@ static void walk_pgd_level(struct seq_file *m) st.current_address = addr; pgd = pgd_offset_k(addr); if (!pgd_none(*pgd)) - walk_pud_level(m, &st, pgd, addr); + walk_p4d_level(m, &st, pgd, addr); else note_page(m, &st, _PAGE_INVALID, 1); addr += PGDIR_SIZE; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 7f6db1e6c048..fbd664e48098 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -537,6 +537,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) unsigned long *table; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; int rc; @@ -573,7 +574,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); VM_BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, vmaddr); + p4d = p4d_offset(pgd, vmaddr); + VM_BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ if (pud_large(*pud)) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index b7b779c40a5b..8ecc25e760fa 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -166,15 +166,15 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, return 1; } -static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, +static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp, pud; - pudp = (pud_t *) pgdp; - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pudp = (pud_t *) pgd_deref(pgd); + pudp = (pud_t *) p4dp; + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) + pudp = (pud_t *) p4d_deref(p4d); pudp += pud_index(addr); do { pud = *pudp; @@ -194,6 +194,29 @@ static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, return 1; } +static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp, p4d; + + p4dp = (p4d_t *) pgdp; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) + p4dp = (p4d_t *) pgd_deref(pgd); + p4dp += p4d_index(addr); + do { + p4d = *p4dp; + barrier(); + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. @@ -228,7 +251,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 9b4050caa4e9..d3a5e39756f6 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -162,16 +162,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp = NULL; pgdp = pgd_offset(mm, addr); - pudp = pud_alloc(mm, pgdp, addr); - if (pudp) { - if (sz == PUD_SIZE) - return (pte_t *) pudp; - else if (sz == PMD_SIZE) - pmdp = pmd_alloc(mm, pudp, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (p4dp) { + pudp = pud_alloc(mm, p4dp, addr); + if (pudp) { + if (sz == PUD_SIZE) + return (pte_t *) pudp; + else if (sz == PMD_SIZE) + pmdp = pmd_alloc(mm, pudp, addr); + } } return (pte_t *) pmdp; } @@ -179,16 +183,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgdp; + p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp = NULL; pgdp = pgd_offset(mm, addr); if (pgd_present(*pgdp)) { - pudp = pud_offset(pgdp, addr); - if (pud_present(*pudp)) { - if (pud_large(*pudp)) - return (pte_t *) pudp; - pmdp = pmd_offset(pudp, addr); + p4dp = p4d_offset(pgdp, addr); + if (p4d_present(*p4dp)) { + pudp = pud_offset(p4dp, addr); + if (pud_present(*pudp)) { + if (pud_large(*pudp)) + return (pte_t *) pudp; + pmdp = pmd_offset(pudp, addr); + } } } return (pte_t *) pmdp; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index b017daed6887..8c5f284044ef 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -120,7 +120,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, check_asce_limit: if (addr + len > current->mm->context.asce_limit) { - rc = crst_table_upgrade(mm); + rc = crst_table_upgrade(mm, addr + len); if (rc) return (unsigned long) rc; } @@ -184,7 +184,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, check_asce_limit: if (addr + len > current->mm->context.asce_limit) { - rc = crst_table_upgrade(mm); + rc = crst_table_upgrade(mm, addr + len); if (rc) return (unsigned long) rc; } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 49e721f3645e..180481589246 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -229,14 +229,14 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } -static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end, +static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; pud_t *pudp; int rc = 0; - pudp = pud_offset(pgd, addr); + pudp = pud_offset(p4d, addr); do { if (pud_none(*pudp)) return -EINVAL; @@ -259,6 +259,26 @@ static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end, return rc; } +static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + p4d_t *p4dp; + int rc = 0; + + p4dp = p4d_offset(pgd, addr); + do { + if (p4d_none(*p4dp)) + return -EINVAL; + next = p4d_addr_end(addr, end); + rc = walk_pud_level(p4dp, addr, next, flags); + p4dp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + static DEFINE_MUTEX(cpa_mutex); static int change_page_attr(unsigned long addr, unsigned long end, @@ -278,7 +298,7 @@ static int change_page_attr(unsigned long addr, unsigned long end, if (pgd_none(*pgdp)) break; next = pgd_addr_end(addr, end); - rc = walk_pud_level(pgdp, addr, next, flags); + rc = walk_p4d_level(pgdp, addr, next, flags); if (rc) break; cond_resched(); @@ -319,6 +339,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) unsigned long address; int nr, i, j; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -326,7 +347,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) for (i = 0; i < numpages;) { address = page_to_phys(page + i); pgd = pgd_offset_k(address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); nr = (unsigned long)pte >> ilog2(sizeof(long)); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index f502cbe657af..18918e394ce4 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -76,29 +76,46 @@ static void __crst_table_upgrade(void *arg) __tlb_flush_local(); } -int crst_table_upgrade(struct mm_struct *mm) +int crst_table_upgrade(struct mm_struct *mm, unsigned long end) { unsigned long *table, *pgd; + int rc, notify; - /* upgrade should only happen from 3 to 4 levels */ - BUG_ON(mm->context.asce_limit != (1UL << 42)); - - table = crst_table_alloc(mm); - if (!table) + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ + BUG_ON(mm->context.asce_limit < (1UL << 42)); + if (end >= TASK_SIZE_MAX) return -ENOMEM; - - spin_lock_bh(&mm->page_table_lock); - pgd = (unsigned long *) mm->pgd; - crst_table_init(table, _REGION2_ENTRY_EMPTY); - pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = 1UL << 53; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - spin_unlock_bh(&mm->page_table_lock); - - on_each_cpu(__crst_table_upgrade, mm, 0); - return 0; + rc = 0; + notify = 0; + while (mm->context.asce_limit < end) { + table = crst_table_alloc(mm); + if (!table) { + rc = -ENOMEM; + break; + } + spin_lock_bh(&mm->page_table_lock); + pgd = (unsigned long *) mm->pgd; + if (mm->context.asce_limit == (1UL << 42)) { + crst_table_init(table, _REGION2_ENTRY_EMPTY); + p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->context.asce_limit = 1UL << 53; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + } else { + crst_table_init(table, _REGION1_ENTRY_EMPTY); + pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->context.asce_limit = -PAGE_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + } + notify = 1; + spin_unlock_bh(&mm->page_table_lock); + } + if (notify) + on_each_cpu(__crst_table_upgrade, mm, 0); + return rc; } void crst_table_downgrade(struct mm_struct *mm) @@ -274,7 +291,7 @@ static void __tlb_remove_table(void *_table) struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); switch (mask) { - case 0: /* pmd or pud */ + case 0: /* pmd, pud, or p4d */ free_pages((unsigned long) table, 2); break; case 1: /* lower 2K of a 4K page table */ diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 947b66a5cdba..d4d409ba206b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -610,6 +610,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) { spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgste_t pgste; @@ -618,7 +619,10 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) bool dirty; pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return false; + pud = pud_alloc(mm, p4d, addr); if (!pud) return false; pmd = pmd_alloc(mm, pud, addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index c33c94b4be60..d8398962a723 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -38,6 +38,17 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_alloc(size, size); } +static inline p4d_t *vmem_p4d_alloc(void) +{ + p4d_t *p4d = NULL; + + p4d = vmem_alloc_pages(2); + if (!p4d) + return NULL; + clear_table((unsigned long *) p4d, _REGION2_ENTRY_EMPTY, PAGE_SIZE * 4); + return p4d; +} + static inline pud_t *vmem_pud_alloc(void) { pud_t *pud = NULL; @@ -85,6 +96,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size) unsigned long end = start + size; unsigned long address = start; pgd_t *pg_dir; + p4d_t *p4_dir; pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; @@ -102,12 +114,19 @@ static int vmem_add_mem(unsigned long start, unsigned long size) while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { + p4_dir = vmem_p4d_alloc(); + if (!p4_dir) + goto out; + pgd_populate(&init_mm, pg_dir, p4_dir); + } + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { pu_dir = vmem_pud_alloc(); if (!pu_dir) goto out; - pgd_populate(&init_mm, pg_dir, pu_dir); + p4d_populate(&init_mm, p4_dir, pu_dir); } - pu_dir = pud_offset(pg_dir, address); + pu_dir = pud_offset(p4_dir, address); if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && !debug_pagealloc_enabled()) { @@ -161,6 +180,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size) unsigned long end = start + size; unsigned long address = start; pgd_t *pg_dir; + p4d_t *p4_dir; pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; @@ -172,7 +192,12 @@ static void vmem_remove_range(unsigned long start, unsigned long size) address += PGDIR_SIZE; continue; } - pu_dir = pud_offset(pg_dir, address); + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { + address += P4D_SIZE; + continue; + } + pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { address += PUD_SIZE; continue; @@ -213,6 +238,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) unsigned long pgt_prot, sgt_prot; unsigned long address = start; pgd_t *pg_dir; + p4d_t *p4_dir; pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; @@ -227,13 +253,21 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) for (address = start; address < end;) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { + p4_dir = vmem_p4d_alloc(); + if (!p4_dir) + goto out; + pgd_populate(&init_mm, pg_dir, p4_dir); + } + + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { pu_dir = vmem_pud_alloc(); if (!pu_dir) goto out; - pgd_populate(&init_mm, pg_dir, pu_dir); + p4d_populate(&init_mm, p4_dir, pu_dir); } - pu_dir = pud_offset(pg_dir, address); + pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { pm_dir = vmem_pmd_alloc(); if (!pm_dir) From 92acfb7406171038ae4e3b6041576642cb75b529 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 May 2017 11:01:26 +0200 Subject: [PATCH 02/52] s390: add missing header includes for type checking Add missing include statements to make sure that prototypes match implementation. As reported by sparse: arch/s390/crypto/arch_random.c:18:1: warning: symbol 's390_arch_random_available' was not declared. Should it be static? arch/s390/kernel/traps.c:279:13: warning: symbol 'trap_init' was not declared. Should it be static? Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/arch_random.c | 1 + arch/s390/kernel/traps.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index 9317b3e645e2..36aefc07d10c 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -12,6 +12,7 @@ #include #include +#include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index f787b9d8f54c..442e5423ce3d 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "entry.h" From cb4ef3c20bd95e21dddc1101133d90ac7f049a53 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 May 2017 12:44:21 +0200 Subject: [PATCH 03/52] s390/pkey: make pkey_init() static drivers/s390/crypto/pkey_api.c:1197:12: warning: symbol 'pkey_init' was not declared. Should it be static? Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/pkey_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index ea86da8c75f9..8f466facf15e 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -1194,7 +1194,7 @@ static struct miscdevice pkey_dev = { /* * Module init */ -int __init pkey_init(void) +static int __init pkey_init(void) { cpacf_mask_t pckmo_functions; From d12a3d603690ba84b7e3fd357f05b96e777d4630 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 May 2017 13:44:43 +0200 Subject: [PATCH 04/52] s390/mm: add __rcu annotations Add __rcu annotations so sparse correctly warns only if "slot" gets derefenced without using rcu_dereference(). Right now we get warnings because of the missing annotation: arch/s390/mm/gmap.c:135:17: warning: incorrect type in assignment (different address spaces) arch/s390/mm/gmap.c:135:17: expected void **slot arch/s390/mm/gmap.c:135:17: got void [noderef] ** Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/gmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index fbd664e48098..4fb3d3cdb370 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -125,7 +125,7 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) struct radix_tree_iter iter; unsigned long indices[16]; unsigned long index; - void **slot; + void __rcu **slot; int i, nr; /* A radix tree is freed by deleting all of its entries */ @@ -150,7 +150,7 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) struct radix_tree_iter iter; unsigned long indices[16]; unsigned long index; - void **slot; + void __rcu **slot; int i, nr; /* A radix tree is freed by deleting all of its entries */ @@ -1011,7 +1011,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { - void **slot; + void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); From 94d26bfcf31244d24fddbe7ba5b0dd17f3c32b11 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 1 Dec 2016 12:51:32 +0100 Subject: [PATCH 05/52] s390/scm: remove cluster option Remove CONFIG_SCM_BLOCK_CLUSTER_WRITE and related code. This quirk is no longer needed on current hardware. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/block/Kconfig | 7 - drivers/s390/block/Makefile | 3 - drivers/s390/block/scm_blk.c | 45 +---- drivers/s390/block/scm_blk.h | 54 ------ drivers/s390/block/scm_blk_cluster.c | 255 --------------------------- 5 files changed, 3 insertions(+), 361 deletions(-) delete mode 100644 drivers/s390/block/scm_blk_cluster.c diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 0acb8c2f9475..31f014b57bfc 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -82,10 +82,3 @@ config SCM_BLOCK To compile this driver as a module, choose M here: the module will be called scm_block. - -config SCM_BLOCK_CLUSTER_WRITE - def_bool y - prompt "SCM force cluster writes" - depends on SCM_BLOCK - help - Force writes to Storage Class Memory (SCM) to be in done in clusters. diff --git a/drivers/s390/block/Makefile b/drivers/s390/block/Makefile index c2f4e673e031..b64e2b32c753 100644 --- a/drivers/s390/block/Makefile +++ b/drivers/s390/block/Makefile @@ -19,7 +19,4 @@ obj-$(CONFIG_BLK_DEV_XPRAM) += xpram.o obj-$(CONFIG_DCSSBLK) += dcssblk.o scm_block-objs := scm_drv.o scm_blk.o -ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE -scm_block-objs += scm_blk_cluster.o -endif obj-$(CONFIG_SCM_BLOCK) += scm_block.o diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 152de6817875..97fd0fcfa3e6 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -42,7 +42,6 @@ static void __scm_free_rq(struct scm_request *scmrq) struct aob_rq_header *aobrq = to_aobrq(scmrq); free_page((unsigned long) scmrq->aob); - __scm_free_rq_cluster(scmrq); kfree(scmrq->request); kfree(aobrq); } @@ -82,9 +81,6 @@ static int __scm_alloc_rq(void) if (!scmrq->request) goto free; - if (__scm_alloc_rq_cluster(scmrq)) - goto free; - INIT_LIST_HEAD(&scmrq->list); spin_lock_irq(&list_lock); list_add(&scmrq->list, &inactive_requests); @@ -234,7 +230,6 @@ static inline void scm_request_init(struct scm_blk_dev *bdev, scmrq->error = 0; /* We don't use all msbs - place aidaws at the end of the aob page. */ scmrq->next_aidaw = (void *) &aob->msb[nr_requests_per_io]; - scm_request_cluster_init(scmrq); } static void scm_ensure_queue_restart(struct scm_blk_dev *bdev) @@ -246,12 +241,11 @@ static void scm_ensure_queue_restart(struct scm_blk_dev *bdev) blk_delay_queue(bdev->rq, SCM_QUEUE_DELAY); } -void scm_request_requeue(struct scm_request *scmrq) +static void scm_request_requeue(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; int i; - scm_release_cluster(scmrq); for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) blk_requeue_request(bdev->rq, scmrq->request[i]); @@ -260,12 +254,11 @@ void scm_request_requeue(struct scm_request *scmrq) scm_ensure_queue_restart(bdev); } -void scm_request_finish(struct scm_request *scmrq) +static void scm_request_finish(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; int i; - scm_release_cluster(scmrq); for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) blk_end_request_all(scmrq->request[i], scmrq->error); @@ -313,31 +306,6 @@ static void scm_blk_request(struct request_queue *rq) } scm_request_set(scmrq, req); - if (!scm_reserve_cluster(scmrq)) { - SCM_LOG(5, "cluster busy"); - scm_request_set(scmrq, NULL); - if (scmrq->aob->request.msb_count) - goto out; - - scm_request_done(scmrq); - return; - } - - if (scm_need_cluster_request(scmrq)) { - if (scmrq->aob->request.msb_count) { - /* Start cluster requests separately. */ - scm_request_set(scmrq, NULL); - if (scm_request_start(scmrq)) - return; - } else { - atomic_inc(&bdev->queued_reqs); - blk_start_request(req); - scm_initiate_cluster_request(scmrq); - } - scmrq = NULL; - continue; - } - if (scm_request_prepare(scmrq)) { SCM_LOG(5, "aidaw alloc failed"); scm_request_set(scmrq, NULL); @@ -444,12 +412,6 @@ static void scm_blk_tasklet(struct scm_blk_dev *bdev) continue; } - if (scm_test_cluster_request(scmrq)) { - scm_cluster_request_irq(scmrq); - spin_lock_irqsave(&bdev->lock, flags); - continue; - } - scm_request_finish(scmrq); spin_lock_irqsave(&bdev->lock, flags); } @@ -498,7 +460,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) blk_queue_max_segments(rq, nr_max_blk); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rq); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, rq); - scm_blk_dev_cluster_setup(bdev); bdev->gendisk = alloc_disk(SCM_NR_PARTS); if (!bdev->gendisk) @@ -558,7 +519,7 @@ static bool __init scm_blk_params_valid(void) if (!nr_requests_per_io || nr_requests_per_io > 64) return false; - return scm_cluster_size_valid(); + return true; } static int __init scm_blk_init(void) diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index 09218cdc5129..f32188dd7909 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -23,9 +23,6 @@ struct scm_blk_dev { atomic_t queued_reqs; enum {SCM_OPER, SCM_WR_PROHIBIT} state; struct list_head finished_requests; -#ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE - struct list_head cluster_list; -#endif }; struct scm_request { @@ -36,13 +33,6 @@ struct scm_request { struct list_head list; u8 retries; int error; -#ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE - struct { - enum {CLUSTER_NONE, CLUSTER_READ, CLUSTER_WRITE} state; - struct list_head list; - void **buf; - } cluster; -#endif }; #define to_aobrq(rq) container_of((void *) rq, struct aob_rq_header, data) @@ -52,55 +42,11 @@ void scm_blk_dev_cleanup(struct scm_blk_dev *); void scm_blk_set_available(struct scm_blk_dev *); void scm_blk_irq(struct scm_device *, void *, int); -void scm_request_finish(struct scm_request *); -void scm_request_requeue(struct scm_request *); - struct aidaw *scm_aidaw_fetch(struct scm_request *scmrq, unsigned int bytes); int scm_drv_init(void); void scm_drv_cleanup(void); -#ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE -void __scm_free_rq_cluster(struct scm_request *); -int __scm_alloc_rq_cluster(struct scm_request *); -void scm_request_cluster_init(struct scm_request *); -bool scm_reserve_cluster(struct scm_request *); -void scm_release_cluster(struct scm_request *); -void scm_blk_dev_cluster_setup(struct scm_blk_dev *); -bool scm_need_cluster_request(struct scm_request *); -void scm_initiate_cluster_request(struct scm_request *); -void scm_cluster_request_irq(struct scm_request *); -bool scm_test_cluster_request(struct scm_request *); -bool scm_cluster_size_valid(void); -#else /* CONFIG_SCM_BLOCK_CLUSTER_WRITE */ -static inline void __scm_free_rq_cluster(struct scm_request *scmrq) {} -static inline int __scm_alloc_rq_cluster(struct scm_request *scmrq) -{ - return 0; -} -static inline void scm_request_cluster_init(struct scm_request *scmrq) {} -static inline bool scm_reserve_cluster(struct scm_request *scmrq) -{ - return true; -} -static inline void scm_release_cluster(struct scm_request *scmrq) {} -static inline void scm_blk_dev_cluster_setup(struct scm_blk_dev *bdev) {} -static inline bool scm_need_cluster_request(struct scm_request *scmrq) -{ - return false; -} -static inline void scm_initiate_cluster_request(struct scm_request *scmrq) {} -static inline void scm_cluster_request_irq(struct scm_request *scmrq) {} -static inline bool scm_test_cluster_request(struct scm_request *scmrq) -{ - return false; -} -static inline bool scm_cluster_size_valid(void) -{ - return true; -} -#endif /* CONFIG_SCM_BLOCK_CLUSTER_WRITE */ - extern debug_info_t *scm_debug; #define SCM_LOG(imp, txt) do { \ diff --git a/drivers/s390/block/scm_blk_cluster.c b/drivers/s390/block/scm_blk_cluster.c deleted file mode 100644 index 7497ddde2dd6..000000000000 --- a/drivers/s390/block/scm_blk_cluster.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Block driver for s390 storage class memory. - * - * Copyright IBM Corp. 2012 - * Author(s): Sebastian Ott - */ - -#include -#include -#include -#include -#include -#include -#include -#include "scm_blk.h" - -static unsigned int write_cluster_size = 64; -module_param(write_cluster_size, uint, S_IRUGO); -MODULE_PARM_DESC(write_cluster_size, - "Number of pages used for contiguous writes."); - -#define CLUSTER_SIZE (write_cluster_size * PAGE_SIZE) - -void __scm_free_rq_cluster(struct scm_request *scmrq) -{ - int i; - - if (!scmrq->cluster.buf) - return; - - for (i = 0; i < 2 * write_cluster_size; i++) - free_page((unsigned long) scmrq->cluster.buf[i]); - - kfree(scmrq->cluster.buf); -} - -int __scm_alloc_rq_cluster(struct scm_request *scmrq) -{ - int i; - - scmrq->cluster.buf = kzalloc(sizeof(void *) * 2 * write_cluster_size, - GFP_KERNEL); - if (!scmrq->cluster.buf) - return -ENOMEM; - - for (i = 0; i < 2 * write_cluster_size; i++) { - scmrq->cluster.buf[i] = (void *) get_zeroed_page(GFP_DMA); - if (!scmrq->cluster.buf[i]) - return -ENOMEM; - } - INIT_LIST_HEAD(&scmrq->cluster.list); - return 0; -} - -void scm_request_cluster_init(struct scm_request *scmrq) -{ - scmrq->cluster.state = CLUSTER_NONE; -} - -static bool clusters_intersect(struct request *A, struct request *B) -{ - unsigned long firstA, lastA, firstB, lastB; - - firstA = ((u64) blk_rq_pos(A) << 9) / CLUSTER_SIZE; - lastA = (((u64) blk_rq_pos(A) << 9) + - blk_rq_bytes(A) - 1) / CLUSTER_SIZE; - - firstB = ((u64) blk_rq_pos(B) << 9) / CLUSTER_SIZE; - lastB = (((u64) blk_rq_pos(B) << 9) + - blk_rq_bytes(B) - 1) / CLUSTER_SIZE; - - return (firstB <= lastA && firstA <= lastB); -} - -bool scm_reserve_cluster(struct scm_request *scmrq) -{ - struct request *req = scmrq->request[scmrq->aob->request.msb_count]; - struct scm_blk_dev *bdev = scmrq->bdev; - struct scm_request *iter; - int pos, add = 1; - - if (write_cluster_size == 0) - return true; - - spin_lock(&bdev->lock); - list_for_each_entry(iter, &bdev->cluster_list, cluster.list) { - if (iter == scmrq) { - /* - * We don't have to use clusters_intersect here, since - * cluster requests are always started separately. - */ - add = 0; - continue; - } - for (pos = 0; pos < iter->aob->request.msb_count; pos++) { - if (clusters_intersect(req, iter->request[pos]) && - (rq_data_dir(req) == WRITE || - rq_data_dir(iter->request[pos]) == WRITE)) { - spin_unlock(&bdev->lock); - return false; - } - } - } - if (add) - list_add(&scmrq->cluster.list, &bdev->cluster_list); - spin_unlock(&bdev->lock); - - return true; -} - -void scm_release_cluster(struct scm_request *scmrq) -{ - struct scm_blk_dev *bdev = scmrq->bdev; - unsigned long flags; - - if (write_cluster_size == 0) - return; - - spin_lock_irqsave(&bdev->lock, flags); - list_del(&scmrq->cluster.list); - spin_unlock_irqrestore(&bdev->lock, flags); -} - -void scm_blk_dev_cluster_setup(struct scm_blk_dev *bdev) -{ - INIT_LIST_HEAD(&bdev->cluster_list); - blk_queue_io_opt(bdev->rq, CLUSTER_SIZE); -} - -static int scm_prepare_cluster_request(struct scm_request *scmrq) -{ - struct scm_blk_dev *bdev = scmrq->bdev; - struct scm_device *scmdev = bdev->gendisk->private_data; - struct request *req = scmrq->request[0]; - struct msb *msb = &scmrq->aob->msb[0]; - struct req_iterator iter; - struct aidaw *aidaw; - struct bio_vec bv; - int i = 0; - u64 addr; - - switch (scmrq->cluster.state) { - case CLUSTER_NONE: - scmrq->cluster.state = CLUSTER_READ; - /* fall through */ - case CLUSTER_READ: - msb->bs = MSB_BS_4K; - msb->oc = MSB_OC_READ; - msb->flags = MSB_FLAG_IDA; - msb->blk_count = write_cluster_size; - - addr = scmdev->address + ((u64) blk_rq_pos(req) << 9); - msb->scm_addr = round_down(addr, CLUSTER_SIZE); - - if (msb->scm_addr != - round_down(addr + (u64) blk_rq_bytes(req) - 1, - CLUSTER_SIZE)) - msb->blk_count = 2 * write_cluster_size; - - aidaw = scm_aidaw_fetch(scmrq, msb->blk_count * PAGE_SIZE); - if (!aidaw) - return -ENOMEM; - - scmrq->aob->request.msb_count = 1; - msb->data_addr = (u64) aidaw; - for (i = 0; i < msb->blk_count; i++) { - aidaw->data_addr = (u64) scmrq->cluster.buf[i]; - aidaw++; - } - - break; - case CLUSTER_WRITE: - aidaw = (void *) msb->data_addr; - msb->oc = MSB_OC_WRITE; - - for (addr = msb->scm_addr; - addr < scmdev->address + ((u64) blk_rq_pos(req) << 9); - addr += PAGE_SIZE) { - aidaw->data_addr = (u64) scmrq->cluster.buf[i]; - aidaw++; - i++; - } - rq_for_each_segment(bv, req, iter) { - aidaw->data_addr = (u64) page_address(bv.bv_page); - aidaw++; - i++; - } - for (; i < msb->blk_count; i++) { - aidaw->data_addr = (u64) scmrq->cluster.buf[i]; - aidaw++; - } - break; - } - return 0; -} - -bool scm_need_cluster_request(struct scm_request *scmrq) -{ - int pos = scmrq->aob->request.msb_count; - - if (rq_data_dir(scmrq->request[pos]) == READ) - return false; - - return blk_rq_bytes(scmrq->request[pos]) < CLUSTER_SIZE; -} - -/* Called with queue lock held. */ -void scm_initiate_cluster_request(struct scm_request *scmrq) -{ - if (scm_prepare_cluster_request(scmrq)) - goto requeue; - if (eadm_start_aob(scmrq->aob)) - goto requeue; - return; -requeue: - scm_request_requeue(scmrq); -} - -bool scm_test_cluster_request(struct scm_request *scmrq) -{ - return scmrq->cluster.state != CLUSTER_NONE; -} - -void scm_cluster_request_irq(struct scm_request *scmrq) -{ - struct scm_blk_dev *bdev = scmrq->bdev; - unsigned long flags; - - switch (scmrq->cluster.state) { - case CLUSTER_NONE: - BUG(); - break; - case CLUSTER_READ: - if (scmrq->error) { - scm_request_finish(scmrq); - break; - } - scmrq->cluster.state = CLUSTER_WRITE; - spin_lock_irqsave(&bdev->rq_lock, flags); - scm_initiate_cluster_request(scmrq); - spin_unlock_irqrestore(&bdev->rq_lock, flags); - break; - case CLUSTER_WRITE: - scm_request_finish(scmrq); - break; - } -} - -bool scm_cluster_size_valid(void) -{ - if (write_cluster_size == 1 || write_cluster_size > 128) - return false; - - return !(write_cluster_size & (write_cluster_size - 1)); -} From 12d9076265398eb2fec3c5dabe7b6713bca8bac9 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 25 Jan 2017 16:18:53 +0100 Subject: [PATCH 06/52] s390/scm: convert to blk-mq Convert scm_blk to use the blk-mq API. This is just a simple conversion since we still use a single queue. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 149 ++++++++++++++++++----------------- drivers/s390/block/scm_blk.h | 2 + 2 files changed, 80 insertions(+), 71 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 97fd0fcfa3e6..701e42e852e2 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -110,13 +111,13 @@ static struct scm_request *scm_request_fetch(void) { struct scm_request *scmrq = NULL; - spin_lock(&list_lock); + spin_lock_irq(&list_lock); if (list_empty(&inactive_requests)) goto out; scmrq = list_first_entry(&inactive_requests, struct scm_request, list); list_del(&scmrq->list); out: - spin_unlock(&list_lock); + spin_unlock_irq(&list_lock); return scmrq; } @@ -232,26 +233,17 @@ static inline void scm_request_init(struct scm_blk_dev *bdev, scmrq->next_aidaw = (void *) &aob->msb[nr_requests_per_io]; } -static void scm_ensure_queue_restart(struct scm_blk_dev *bdev) -{ - if (atomic_read(&bdev->queued_reqs)) { - /* Queue restart is triggered by the next interrupt. */ - return; - } - blk_delay_queue(bdev->rq, SCM_QUEUE_DELAY); -} - static void scm_request_requeue(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; int i; for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) - blk_requeue_request(bdev->rq, scmrq->request[i]); + blk_mq_requeue_request(scmrq->request[i], false); atomic_dec(&bdev->queued_reqs); scm_request_done(scmrq); - scm_ensure_queue_restart(bdev); + blk_mq_kick_requeue_list(bdev->rq); } static void scm_request_finish(struct scm_request *scmrq) @@ -259,73 +251,74 @@ static void scm_request_finish(struct scm_request *scmrq) struct scm_blk_dev *bdev = scmrq->bdev; int i; - for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) - blk_end_request_all(scmrq->request[i], scmrq->error); + for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) { + if (scmrq->error) + blk_mq_end_request(scmrq->request[i], scmrq->error); + else + blk_mq_complete_request(scmrq->request[i]); + } atomic_dec(&bdev->queued_reqs); scm_request_done(scmrq); } -static int scm_request_start(struct scm_request *scmrq) +static void scm_request_start(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; - int ret; atomic_inc(&bdev->queued_reqs); - if (!scmrq->aob->request.msb_count) { - scm_request_requeue(scmrq); - return -EINVAL; - } - - ret = eadm_start_aob(scmrq->aob); - if (ret) { + if (eadm_start_aob(scmrq->aob)) { SCM_LOG(5, "no subchannel"); scm_request_requeue(scmrq); } - return ret; } -static void scm_blk_request(struct request_queue *rq) +static int scm_blk_request(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *qd) { - struct scm_device *scmdev = rq->queuedata; + struct scm_device *scmdev = hctx->queue->queuedata; struct scm_blk_dev *bdev = dev_get_drvdata(&scmdev->dev); - struct scm_request *scmrq = NULL; - struct request *req; + struct request *req = qd->rq; + struct scm_request *scmrq; - while ((req = blk_peek_request(rq))) { - if (!scm_permit_request(bdev, req)) - goto out; - - if (!scmrq) { - scmrq = scm_request_fetch(); - if (!scmrq) { - SCM_LOG(5, "no request"); - goto out; - } - scm_request_init(bdev, scmrq); - } - scm_request_set(scmrq, req); - - if (scm_request_prepare(scmrq)) { - SCM_LOG(5, "aidaw alloc failed"); - scm_request_set(scmrq, NULL); - goto out; - } - blk_start_request(req); - - if (scmrq->aob->request.msb_count < nr_requests_per_io) - continue; - - if (scm_request_start(scmrq)) - return; - - scmrq = NULL; + spin_lock(&bdev->rq_lock); + if (!scm_permit_request(bdev, req)) { + spin_unlock(&bdev->rq_lock); + return BLK_MQ_RQ_QUEUE_BUSY; } -out: - if (scmrq) + + scmrq = hctx->driver_data; + if (!scmrq) { + scmrq = scm_request_fetch(); + if (!scmrq) { + SCM_LOG(5, "no request"); + spin_unlock(&bdev->rq_lock); + return BLK_MQ_RQ_QUEUE_BUSY; + } + scm_request_init(bdev, scmrq); + hctx->driver_data = scmrq; + } + scm_request_set(scmrq, req); + + if (scm_request_prepare(scmrq)) { + SCM_LOG(5, "aidaw alloc failed"); + scm_request_set(scmrq, NULL); + + if (scmrq->aob->request.msb_count) + scm_request_start(scmrq); + + hctx->driver_data = NULL; + spin_unlock(&bdev->rq_lock); + return BLK_MQ_RQ_QUEUE_BUSY; + } + blk_mq_start_request(req); + + if (qd->last || scmrq->aob->request.msb_count == nr_requests_per_io) { scm_request_start(scmrq); - else - scm_ensure_queue_restart(bdev); + hctx->driver_data = NULL; + } + spin_unlock(&bdev->rq_lock); + return BLK_MQ_RQ_QUEUE_OK; } static void __scmrq_log_error(struct scm_request *scmrq) @@ -387,9 +380,7 @@ restart: return; requeue: - spin_lock_irqsave(&bdev->rq_lock, flags); scm_request_requeue(scmrq); - spin_unlock_irqrestore(&bdev->rq_lock, flags); } static void scm_blk_tasklet(struct scm_blk_dev *bdev) @@ -416,19 +407,21 @@ static void scm_blk_tasklet(struct scm_blk_dev *bdev) spin_lock_irqsave(&bdev->lock, flags); } spin_unlock_irqrestore(&bdev->lock, flags); - /* Look out for more requests. */ - blk_run_queue(bdev->rq); } static const struct block_device_operations scm_blk_devops = { .owner = THIS_MODULE, }; +static const struct blk_mq_ops scm_mq_ops = { + .queue_rq = scm_blk_request, +}; + int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) { - struct request_queue *rq; - int len, ret = -ENOMEM; unsigned int devindex, nr_max_blk; + struct request_queue *rq; + int len, ret; devindex = atomic_inc_return(&nr_devices) - 1; /* scma..scmz + scmaa..scmzz */ @@ -447,10 +440,20 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) (void (*)(unsigned long)) scm_blk_tasklet, (unsigned long) bdev); - rq = blk_init_queue(scm_blk_request, &bdev->rq_lock); - if (!rq) + bdev->tag_set.ops = &scm_mq_ops; + bdev->tag_set.nr_hw_queues = 1; + bdev->tag_set.queue_depth = nr_requests_per_io * nr_requests; + bdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + + ret = blk_mq_alloc_tag_set(&bdev->tag_set); + if (ret) goto out; + rq = blk_mq_init_queue(&bdev->tag_set); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + goto out_tag; + } bdev->rq = rq; nr_max_blk = min(scmdev->nr_max_block, (unsigned int) (PAGE_SIZE / sizeof(struct aidaw))); @@ -462,9 +465,10 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, rq); bdev->gendisk = alloc_disk(SCM_NR_PARTS); - if (!bdev->gendisk) + if (!bdev->gendisk) { + ret = -ENOMEM; goto out_queue; - + } rq->queuedata = scmdev; bdev->gendisk->private_data = scmdev; bdev->gendisk->fops = &scm_blk_devops; @@ -489,6 +493,8 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) out_queue: blk_cleanup_queue(rq); +out_tag: + blk_mq_free_tag_set(&bdev->tag_set); out: atomic_dec(&nr_devices); return ret; @@ -499,6 +505,7 @@ void scm_blk_dev_cleanup(struct scm_blk_dev *bdev) tasklet_kill(&bdev->tasklet); del_gendisk(bdev->gendisk); blk_cleanup_queue(bdev->gendisk->queue); + blk_mq_free_tag_set(&bdev->tag_set); put_disk(bdev->gendisk); } diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index f32188dd7909..160564399ff4 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ struct scm_blk_dev { struct tasklet_struct tasklet; struct request_queue *rq; struct gendisk *gendisk; + struct blk_mq_tag_set tag_set; struct scm_device *scmdev; spinlock_t rq_lock; /* guard the request queue */ spinlock_t lock; /* guard the rest of the blockdev */ From c7b3e92331fbb905579e67aeed202a37eade54b2 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 31 Jan 2017 16:15:25 +0100 Subject: [PATCH 07/52] s390/scm: convert tasklet Drop the tasklet that was used to complete requests in favor of block layer helpers that finish the IO on the CPU that initiated it. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 54 ++++++++++-------------------------- drivers/s390/block/scm_blk.h | 1 - 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 701e42e852e2..bba798c699f1 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -338,21 +338,6 @@ static void __scmrq_log_error(struct scm_request *scmrq) scmrq->error); } -void scm_blk_irq(struct scm_device *scmdev, void *data, int error) -{ - struct scm_request *scmrq = data; - struct scm_blk_dev *bdev = scmrq->bdev; - - scmrq->error = error; - if (error) - __scmrq_log_error(scmrq); - - spin_lock(&bdev->lock); - list_add_tail(&scmrq->list, &bdev->finished_requests); - spin_unlock(&bdev->lock); - tasklet_hi_schedule(&bdev->tasklet); -} - static void scm_blk_handle_error(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; @@ -383,30 +368,25 @@ requeue: scm_request_requeue(scmrq); } -static void scm_blk_tasklet(struct scm_blk_dev *bdev) +void scm_blk_irq(struct scm_device *scmdev, void *data, int error) { - struct scm_request *scmrq; - unsigned long flags; + struct scm_request *scmrq = data; - spin_lock_irqsave(&bdev->lock, flags); - while (!list_empty(&bdev->finished_requests)) { - scmrq = list_first_entry(&bdev->finished_requests, - struct scm_request, list); - list_del(&scmrq->list); - spin_unlock_irqrestore(&bdev->lock, flags); - - if (scmrq->error && scmrq->retries-- > 0) { + scmrq->error = error; + if (error) { + __scmrq_log_error(scmrq); + if (scmrq->retries-- > 0) { scm_blk_handle_error(scmrq); - - /* Request restarted or requeued, handle next. */ - spin_lock_irqsave(&bdev->lock, flags); - continue; + return; } - - scm_request_finish(scmrq); - spin_lock_irqsave(&bdev->lock, flags); } - spin_unlock_irqrestore(&bdev->lock, flags); + + scm_request_finish(scmrq); +} + +static void scm_blk_request_done(struct request *req) +{ + blk_mq_end_request(req, 0); } static const struct block_device_operations scm_blk_devops = { @@ -415,6 +395,7 @@ static const struct block_device_operations scm_blk_devops = { static const struct blk_mq_ops scm_mq_ops = { .queue_rq = scm_blk_request, + .complete = scm_blk_request_done, }; int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) @@ -434,11 +415,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) bdev->state = SCM_OPER; spin_lock_init(&bdev->rq_lock); spin_lock_init(&bdev->lock); - INIT_LIST_HEAD(&bdev->finished_requests); atomic_set(&bdev->queued_reqs, 0); - tasklet_init(&bdev->tasklet, - (void (*)(unsigned long)) scm_blk_tasklet, - (unsigned long) bdev); bdev->tag_set.ops = &scm_mq_ops; bdev->tag_set.nr_hw_queues = 1; @@ -502,7 +479,6 @@ out: void scm_blk_dev_cleanup(struct scm_blk_dev *bdev) { - tasklet_kill(&bdev->tasklet); del_gendisk(bdev->gendisk); blk_cleanup_queue(bdev->gendisk->queue); blk_mq_free_tag_set(&bdev->tag_set); diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index 160564399ff4..f7b4d9ba43d1 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -15,7 +15,6 @@ #define SCM_QUEUE_DELAY 5 struct scm_blk_dev { - struct tasklet_struct tasklet; struct request_queue *rq; struct gendisk *gendisk; struct blk_mq_tag_set tag_set; From 9861dbd5b4a422ae03a8caa2fa6d2827912aa952 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Fri, 24 Feb 2017 17:50:17 +0100 Subject: [PATCH 08/52] s390/scm: use multiple queues Exploit multiple hardware contexts (queues) that can process requests in parallel. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 52 ++++++++++++++++++++++++++++-------- drivers/s390/block/scm_blk.h | 3 +-- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index bba798c699f1..725f912fab41 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -273,30 +273,36 @@ static void scm_request_start(struct scm_request *scmrq) } } +struct scm_queue { + struct scm_request *scmrq; + spinlock_t lock; +}; + static int scm_blk_request(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { struct scm_device *scmdev = hctx->queue->queuedata; struct scm_blk_dev *bdev = dev_get_drvdata(&scmdev->dev); + struct scm_queue *sq = hctx->driver_data; struct request *req = qd->rq; struct scm_request *scmrq; - spin_lock(&bdev->rq_lock); + spin_lock(&sq->lock); if (!scm_permit_request(bdev, req)) { - spin_unlock(&bdev->rq_lock); + spin_unlock(&sq->lock); return BLK_MQ_RQ_QUEUE_BUSY; } - scmrq = hctx->driver_data; + scmrq = sq->scmrq; if (!scmrq) { scmrq = scm_request_fetch(); if (!scmrq) { SCM_LOG(5, "no request"); - spin_unlock(&bdev->rq_lock); + spin_unlock(&sq->lock); return BLK_MQ_RQ_QUEUE_BUSY; } scm_request_init(bdev, scmrq); - hctx->driver_data = scmrq; + sq->scmrq = scmrq; } scm_request_set(scmrq, req); @@ -307,20 +313,43 @@ static int scm_blk_request(struct blk_mq_hw_ctx *hctx, if (scmrq->aob->request.msb_count) scm_request_start(scmrq); - hctx->driver_data = NULL; - spin_unlock(&bdev->rq_lock); + sq->scmrq = NULL; + spin_unlock(&sq->lock); return BLK_MQ_RQ_QUEUE_BUSY; } blk_mq_start_request(req); if (qd->last || scmrq->aob->request.msb_count == nr_requests_per_io) { scm_request_start(scmrq); - hctx->driver_data = NULL; + sq->scmrq = NULL; } - spin_unlock(&bdev->rq_lock); + spin_unlock(&sq->lock); return BLK_MQ_RQ_QUEUE_OK; } +static int scm_blk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int idx) +{ + struct scm_queue *qd = kzalloc(sizeof(*qd), GFP_KERNEL); + + if (!qd) + return -ENOMEM; + + spin_lock_init(&qd->lock); + hctx->driver_data = qd; + + return 0; +} + +static void scm_blk_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int idx) +{ + struct scm_queue *qd = hctx->driver_data; + + WARN_ON(qd->scmrq); + kfree(hctx->driver_data); + hctx->driver_data = NULL; +} + static void __scmrq_log_error(struct scm_request *scmrq) { struct aob *aob = scmrq->aob; @@ -396,6 +425,8 @@ static const struct block_device_operations scm_blk_devops = { static const struct blk_mq_ops scm_mq_ops = { .queue_rq = scm_blk_request, .complete = scm_blk_request_done, + .init_hctx = scm_blk_init_hctx, + .exit_hctx = scm_blk_exit_hctx, }; int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) @@ -413,12 +444,11 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) bdev->scmdev = scmdev; bdev->state = SCM_OPER; - spin_lock_init(&bdev->rq_lock); spin_lock_init(&bdev->lock); atomic_set(&bdev->queued_reqs, 0); bdev->tag_set.ops = &scm_mq_ops; - bdev->tag_set.nr_hw_queues = 1; + bdev->tag_set.nr_hw_queues = nr_requests; bdev->tag_set.queue_depth = nr_requests_per_io * nr_requests; bdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index f7b4d9ba43d1..242d17a91920 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -19,8 +19,7 @@ struct scm_blk_dev { struct gendisk *gendisk; struct blk_mq_tag_set tag_set; struct scm_device *scmdev; - spinlock_t rq_lock; /* guard the request queue */ - spinlock_t lock; /* guard the rest of the blockdev */ + spinlock_t lock; atomic_t queued_reqs; enum {SCM_OPER, SCM_WR_PROHIBIT} state; struct list_head finished_requests; From 3050c20821d7e9f15566d105da9a0913de61e85c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Tue, 9 May 2017 14:07:37 +0200 Subject: [PATCH 09/52] s390/dasd: Remove variable sized array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynamic stack allocations are considered bad. Get rid of this one occurrence and use kstrdup() instead. Also, set the return codes so that we have only one exit where we can call kfree(). Signed-off-by: Jan Höppner Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_devmap.c | 47 ++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 1164b51d09f3..05e5762d045e 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -315,45 +315,58 @@ static int __init dasd_parse_range(const char *range) char *features_str = NULL; char *from_str = NULL; char *to_str = NULL; - size_t len = strlen(range) + 1; - char tmp[len]; + int rc = 0; + char *tmp; - strlcpy(tmp, range, len); + tmp = kstrdup(range, GFP_KERNEL); + if (!tmp) + return -ENOMEM; - if (dasd_evaluate_range_param(tmp, &from_str, &to_str, &features_str)) - goto out_err; + if (dasd_evaluate_range_param(tmp, &from_str, &to_str, &features_str)) { + rc = -EINVAL; + goto out; + } - if (dasd_busid(from_str, &from_id0, &from_id1, &from)) - goto out_err; + if (dasd_busid(from_str, &from_id0, &from_id1, &from)) { + rc = -EINVAL; + goto out; + } to = from; to_id0 = from_id0; to_id1 = from_id1; if (to_str) { - if (dasd_busid(to_str, &to_id0, &to_id1, &to)) - goto out_err; + if (dasd_busid(to_str, &to_id0, &to_id1, &to)) { + rc = -EINVAL; + goto out; + } if (from_id0 != to_id0 || from_id1 != to_id1 || from > to) { pr_err("%s is not a valid device range\n", range); - goto out_err; + rc = -EINVAL; + goto out; } } features = dasd_feature_list(features_str); - if (features < 0) - goto out_err; + if (features < 0) { + rc = -EINVAL; + goto out; + } /* each device in dasd= parameter should be set initially online */ features |= DASD_FEATURE_INITIAL_ONLINE; while (from <= to) { sprintf(bus_id, "%01x.%01x.%04x", from_id0, from_id1, from++); devmap = dasd_add_busid(bus_id, features); - if (IS_ERR(devmap)) - return PTR_ERR(devmap); + if (IS_ERR(devmap)) { + rc = PTR_ERR(devmap); + goto out; + } } - return 0; +out: + kfree(tmp); -out_err: - return -EINVAL; + return rc; } /* From 7a003637923e9b127dec84fd55b67f4c2c900684 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 May 2017 12:42:26 +0200 Subject: [PATCH 10/52] s390/pkey: add missing __user annotations Add missing __user annotations to get rid of a couple of sparse warnings. All callers actually pass kernel pointers instead of user space pointers, however the pointers are being used within KERNEL_DS. So everything is fine. Corresponding sparse warnings: drivers/s390/crypto/pkey_api.c:181:41: warning: incorrect type in assignment (different address spaces) expected char [noderef] *request_control_blk_addr got void * Cc: Harald Freudenberger Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/pkey_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 8f466facf15e..f61fa47135a6 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -178,9 +178,9 @@ static inline void prep_xcrb(struct ica_xcRB *pxcrb, pxcrb->user_defined = (cardnr == 0xFFFF ? AUTOSELECT : cardnr); pxcrb->request_control_blk_length = preqcblk->cprb_len + preqcblk->req_parml; - pxcrb->request_control_blk_addr = (void *) preqcblk; + pxcrb->request_control_blk_addr = (void __user *) preqcblk; pxcrb->reply_control_blk_length = preqcblk->rpl_msgbl; - pxcrb->reply_control_blk_addr = (void *) prepcblk; + pxcrb->reply_control_blk_addr = (void __user *) prepcblk; } /* From 6c386da799078fddb482bc5c1ab6a62c9a360e4e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 12 May 2017 12:57:40 +0200 Subject: [PATCH 11/52] s390: use two more generic header files I missed at least these two header files where we can make use of the generic ones. vga.h is another one, however that is already addressed by a patch from Jiri Slaby. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/Kbuild | 2 ++ arch/s390/include/asm/device.h | 10 ---------- arch/s390/include/asm/fb.h | 12 ------------ 3 files changed, 2 insertions(+), 22 deletions(-) delete mode 100644 arch/s390/include/asm/device.h delete mode 100644 arch/s390/include/asm/fb.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 45092b12f54f..b3c88479feba 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -1,10 +1,12 @@ generic-y += asm-offsets.h generic-y += cacheflush.h generic-y += clkdev.h +generic-y += device.h generic-y += dma-contiguous.h generic-y += div64.h generic-y += emergency-restart.h generic-y += export.h +generic-y += fb.h generic-y += irq_regs.h generic-y += irq_work.h generic-y += kmap_types.h diff --git a/arch/s390/include/asm/device.h b/arch/s390/include/asm/device.h deleted file mode 100644 index 5203fc87f080..000000000000 --- a/arch/s390/include/asm/device.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Arch specific extensions to struct device - * - * This file is released under the GPLv2 - */ -struct dev_archdata { -}; - -struct pdev_archdata { -}; diff --git a/arch/s390/include/asm/fb.h b/arch/s390/include/asm/fb.h deleted file mode 100644 index c7df38030992..000000000000 --- a/arch/s390/include/asm/fb.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ From b29e061bb7a8d2e2fbf7826fb30ab5477d595e92 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 12 May 2017 15:31:38 +0200 Subject: [PATCH 12/52] s390/cputime: simplify account_system_index_scaled The account_system_index_scaled gets two cputime values, a raw value derived from CPU timer deltas and a scaled value. The scaled value is always calculated from the raw value, the code can be simplified by moving the scale_vtime call into account_system_index_scaled. Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/vtime.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 072d84ba42a3..dd7178fbb4f3 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -110,11 +110,10 @@ static inline u64 scale_vtime(u64 vtime) return vtime; } -static void account_system_index_scaled(struct task_struct *p, - u64 cputime, u64 scaled, +static void account_system_index_scaled(struct task_struct *p, u64 cputime, enum cpu_usage_stat index) { - p->stimescaled += cputime_to_nsecs(scaled); + p->stimescaled += cputime_to_nsecs(scale_vtime(cputime)); account_system_index_time(p, cputime_to_nsecs(cputime), index); } @@ -176,14 +175,11 @@ static int do_account_vtime(struct task_struct *tsk) } if (system) - account_system_index_scaled(tsk, system, scale_vtime(system), - CPUTIME_SYSTEM); + account_system_index_scaled(tsk, system, CPUTIME_SYSTEM); if (hardirq) - account_system_index_scaled(tsk, hardirq, scale_vtime(hardirq), - CPUTIME_IRQ); + account_system_index_scaled(tsk, hardirq, CPUTIME_IRQ); if (softirq) - account_system_index_scaled(tsk, softirq, scale_vtime(softirq), - CPUTIME_SOFTIRQ); + account_system_index_scaled(tsk, softirq, CPUTIME_SOFTIRQ); steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { From 35bb092a91b4c43a2079d3d5458b74a059455b6c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 15 May 2017 10:23:38 +0200 Subject: [PATCH 13/52] s390/vdso: use _install_special_mapping to establish vdso Switch to the improved _install_special_mapping function to install the vdso mapping. This has two advantages, the arch_vma_name function is not needed anymore and the vdso vma still has its name after its memory location has been changed with mremap. Tested-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/vdso.c | 91 +++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 10516ae3b55e..b89d19f6f2ab 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -50,6 +50,56 @@ static struct page **vdso64_pagelist; */ unsigned int __read_mostly vdso_enabled = 1; +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page **vdso_pagelist; + unsigned long vdso_pages; + + vdso_pagelist = vdso64_pagelist; + vdso_pages = vdso64_pages; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + vdso_pagelist = vdso32_pagelist; + vdso_pages = vdso32_pages; + } +#endif + + if (vmf->pgoff >= vdso_pages) + return VM_FAULT_SIGBUS; + + vmf->page = vdso_pagelist[vmf->pgoff]; + get_page(vmf->page); + return 0; +} + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *vma) +{ + unsigned long vdso_pages; + + vdso_pages = vdso64_pages; +#ifdef CONFIG_COMPAT + if (is_compat_task()) + vdso_pages = vdso32_pages; +#endif + + if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start) + return -EINVAL; + + if (WARN_ON_ONCE(current->mm != vma->vm_mm)) + return -EFAULT; + + current->mm->context.vdso_base = vma->vm_start; + return 0; +} + +static const struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .fault = vdso_fault, + .mremap = vdso_mremap, +}; + static int __init vdso_setup(char *s) { unsigned long val; @@ -181,7 +231,7 @@ static void vdso_init_cr5(void) int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; - struct page **vdso_pagelist; + struct vm_area_struct *vma; unsigned long vdso_pages; unsigned long vdso_base; int rc; @@ -194,13 +244,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!uses_interp) return 0; - vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; #ifdef CONFIG_COMPAT - if (is_compat_task()) { - vdso_pagelist = vdso32_pagelist; + if (is_compat_task()) vdso_pages = vdso32_pages; - } #endif /* * vDSO has a problem and was disabled, just don't "enable" it for @@ -209,8 +256,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (vdso_pages == 0) return 0; - current->mm->context.vdso_base = 0; - /* * pick a base address for the vDSO in process space. We try to put * it at vdso_base which is the "natural" base for it, but we might @@ -224,13 +269,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out_up; } - /* - * Put vDSO base into mm struct. We need to do this before calling - * install_special_mapping or the perf counter mmap tracking code - * will fail to recognise it as a vDSO (since arch_vma_name fails). - */ - current->mm->context.vdso_base = vdso_base; - /* * our vma flags don't have VM_WRITE so by default, the process * isn't allowed to write those pages. @@ -241,24 +279,23 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * It's fine to use that for setting breakpoints in the vDSO code * pages though. */ - rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_pagelist); - if (rc) - current->mm->context.vdso_base = 0; + vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &vdso_mapping); + if (IS_ERR(vma)) { + rc = PTR_ERR(vma); + goto out_up; + } + + current->mm->context.vdso_base = vdso_base; + rc = 0; + out_up: up_write(&mm->mmap_sem); return rc; } -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso_base) - return "[vdso]"; - return NULL; -} - static int __init vdso_init(void) { int i; From 8ff3458865ef43de80c10a32a4797bb941840aee Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 May 2017 12:19:16 +0200 Subject: [PATCH 14/52] s390/zcrypt: get rid of little/big endian handling The zcrypt code contains a couple of functions which receive a "big_endian" argument. All callers naturally pass "1" for big endian, since s390 is big endian. Therefore get rid of this argument and also get rid of the cpu_to_le()/cpu_to_be() calls. This way we get rid of a couple of sparse warnings: drivers/s390/crypto/zcrypt_cca_key.h:255:34: warning: incorrect type in assignment (different base types) expected unsigned short [unsigned] ulen got restricted __be16 [usertype] Cc: Harald Freudenberger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_cca_key.h | 36 +++++++-------------------- drivers/s390/crypto/zcrypt_msgtype6.c | 4 +-- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_cca_key.h b/drivers/s390/crypto/zcrypt_cca_key.h index ca0cdbe46368..771cbb91c11e 100644 --- a/drivers/s390/crypto/zcrypt_cca_key.h +++ b/drivers/s390/crypto/zcrypt_cca_key.h @@ -133,8 +133,7 @@ struct cca_pvt_ext_CRT_sec { * * Returns the size of the key area or -EFAULT */ -static inline int zcrypt_type6_mex_key_de(struct ica_rsa_modexpo *mex, - void *p, int big_endian) +static inline int zcrypt_type6_mex_key_de(struct ica_rsa_modexpo *mex, void *p) { static struct cca_token_hdr static_pvt_me_hdr = { .token_identifier = 0x1E, @@ -162,13 +161,8 @@ static inline int zcrypt_type6_mex_key_de(struct ica_rsa_modexpo *mex, memset(key, 0, sizeof(*key)); - if (big_endian) { - key->t6_hdr.blen = cpu_to_be16(0x189); - key->t6_hdr.ulen = cpu_to_be16(0x189 - 2); - } else { - key->t6_hdr.blen = cpu_to_le16(0x189); - key->t6_hdr.ulen = cpu_to_le16(0x189 - 2); - } + key->t6_hdr.blen = 0x189; + key->t6_hdr.ulen = 0x189 - 2; key->pvtMeHdr = static_pvt_me_hdr; key->pvtMeSec = static_pvt_me_sec; key->pubMeSec = static_pub_me_sec; @@ -205,8 +199,7 @@ static inline int zcrypt_type6_mex_key_de(struct ica_rsa_modexpo *mex, * * Returns the size of the key area or -EFAULT */ -static inline int zcrypt_type6_mex_key_en(struct ica_rsa_modexpo *mex, - void *p, int big_endian) +static inline int zcrypt_type6_mex_key_en(struct ica_rsa_modexpo *mex, void *p) { static struct cca_token_hdr static_pub_hdr = { .token_identifier = 0x1E, @@ -251,13 +244,8 @@ static inline int zcrypt_type6_mex_key_en(struct ica_rsa_modexpo *mex, 2*mex->inputdatalength - i; key->pubHdr.token_length = key->pubSec.section_length + sizeof(key->pubHdr); - if (big_endian) { - key->t6_hdr.ulen = cpu_to_be16(key->pubHdr.token_length + 4); - key->t6_hdr.blen = cpu_to_be16(key->pubHdr.token_length + 6); - } else { - key->t6_hdr.ulen = cpu_to_le16(key->pubHdr.token_length + 4); - key->t6_hdr.blen = cpu_to_le16(key->pubHdr.token_length + 6); - } + key->t6_hdr.ulen = key->pubHdr.token_length + 4; + key->t6_hdr.blen = key->pubHdr.token_length + 6; return sizeof(*key) + 2*mex->inputdatalength - i; } @@ -271,8 +259,7 @@ static inline int zcrypt_type6_mex_key_en(struct ica_rsa_modexpo *mex, * * Returns the size of the key area or -EFAULT */ -static inline int zcrypt_type6_crt_key(struct ica_rsa_modexpo_crt *crt, - void *p, int big_endian) +static inline int zcrypt_type6_crt_key(struct ica_rsa_modexpo_crt *crt, void *p) { static struct cca_public_sec static_cca_pub_sec = { .section_identifier = 4, @@ -298,13 +285,8 @@ static inline int zcrypt_type6_crt_key(struct ica_rsa_modexpo_crt *crt, size = sizeof(*key) + key_len + sizeof(*pub) + 3; /* parameter block.key block */ - if (big_endian) { - key->t6_hdr.blen = cpu_to_be16(size); - key->t6_hdr.ulen = cpu_to_be16(size - 2); - } else { - key->t6_hdr.blen = cpu_to_le16(size); - key->t6_hdr.ulen = cpu_to_le16(size - 2); - } + key->t6_hdr.blen = size; + key->t6_hdr.ulen = size - 2; /* key token header */ key->token.token_identifier = CCA_TKN_HDR_ID_EXT; diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index e5563ffeb839..4fddb4319481 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -291,7 +291,7 @@ static int ICAMEX_msg_to_type6MEX_msgX(struct zcrypt_queue *zq, return -EFAULT; /* Set up key which is located after the variable length text. */ - size = zcrypt_type6_mex_key_en(mex, msg->text+mex->inputdatalength, 1); + size = zcrypt_type6_mex_key_en(mex, msg->text+mex->inputdatalength); if (size < 0) return size; size += sizeof(*msg) + mex->inputdatalength; @@ -353,7 +353,7 @@ static int ICACRT_msg_to_type6CRT_msgX(struct zcrypt_queue *zq, return -EFAULT; /* Set up key which is located after the variable length text. */ - size = zcrypt_type6_crt_key(crt, msg->text + crt->inputdatalength, 1); + size = zcrypt_type6_crt_key(crt, msg->text + crt->inputdatalength); if (size < 0) return size; size += sizeof(*msg) + crt->inputdatalength; /* total size of msg */ From a1b19d07ca71d6c60b49771f244fc6536cd15358 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 15 May 2017 12:38:15 +0200 Subject: [PATCH 15/52] s390/zcrypt: remove unused function zcrypt_type6_mex_key_de() Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_cca_key.h | 85 ---------------------------- 1 file changed, 85 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_cca_key.h b/drivers/s390/crypto/zcrypt_cca_key.h index 771cbb91c11e..12cff6262566 100644 --- a/drivers/s390/crypto/zcrypt_cca_key.h +++ b/drivers/s390/crypto/zcrypt_cca_key.h @@ -48,26 +48,6 @@ struct cca_token_hdr { #define CCA_TKN_HDR_ID_EXT 0x1E -/** - * mapping for the cca private ME section - */ -struct cca_private_ext_ME_sec { - unsigned char section_identifier; - unsigned char version; - unsigned short section_length; - unsigned char private_key_hash[20]; - unsigned char reserved1[4]; - unsigned char key_format; - unsigned char reserved2; - unsigned char key_name_hash[20]; - unsigned char key_use_flags[4]; - unsigned char reserved3[6]; - unsigned char reserved4[24]; - unsigned char confounder[24]; - unsigned char exponent[128]; - unsigned char modulus[128]; -} __attribute__((packed)); - #define CCA_PVT_USAGE_ALL 0x80 /** @@ -123,71 +103,6 @@ struct cca_pvt_ext_CRT_sec { #define CCA_PVT_EXT_CRT_SEC_ID_PVT 0x08 #define CCA_PVT_EXT_CRT_SEC_FMT_CL 0x40 -/** - * Set up private key fields of a type6 MEX message. - * Note that all numerics in the key token are big-endian, - * while the entries in the key block header are little-endian. - * - * @mex: pointer to user input data - * @p: pointer to memory area for the key - * - * Returns the size of the key area or -EFAULT - */ -static inline int zcrypt_type6_mex_key_de(struct ica_rsa_modexpo *mex, void *p) -{ - static struct cca_token_hdr static_pvt_me_hdr = { - .token_identifier = 0x1E, - .token_length = 0x0183, - }; - static struct cca_private_ext_ME_sec static_pvt_me_sec = { - .section_identifier = 0x02, - .section_length = 0x016C, - .key_use_flags = {0x80,0x00,0x00,0x00}, - }; - static struct cca_public_sec static_pub_me_sec = { - .section_identifier = 0x04, - .section_length = 0x000F, - .exponent_len = 0x0003, - }; - static char pk_exponent[3] = { 0x01, 0x00, 0x01 }; - struct { - struct T6_keyBlock_hdr t6_hdr; - struct cca_token_hdr pvtMeHdr; - struct cca_private_ext_ME_sec pvtMeSec; - struct cca_public_sec pubMeSec; - char exponent[3]; - } __attribute__((packed)) *key = p; - unsigned char *temp; - - memset(key, 0, sizeof(*key)); - - key->t6_hdr.blen = 0x189; - key->t6_hdr.ulen = 0x189 - 2; - key->pvtMeHdr = static_pvt_me_hdr; - key->pvtMeSec = static_pvt_me_sec; - key->pubMeSec = static_pub_me_sec; - /* - * In a private key, the modulus doesn't appear in the public - * section. So, an arbitrary public exponent of 0x010001 will be - * used. - */ - memcpy(key->exponent, pk_exponent, 3); - - /* key parameter block */ - temp = key->pvtMeSec.exponent + - sizeof(key->pvtMeSec.exponent) - mex->inputdatalength; - if (copy_from_user(temp, mex->b_key, mex->inputdatalength)) - return -EFAULT; - - /* modulus */ - temp = key->pvtMeSec.modulus + - sizeof(key->pvtMeSec.modulus) - mex->inputdatalength; - if (copy_from_user(temp, mex->n_modulus, mex->inputdatalength)) - return -EFAULT; - key->pubMeSec.modulus_bit_len = 8 * mex->inputdatalength; - return sizeof(*key); -} - /** * Set up private key fields of a type6 MEX message. The _pad variant * strips leading zeroes from the b_key. From c4684f98d3453dd07cc7ce67e0e795330eeec9c5 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Thu, 11 May 2017 17:15:54 +0200 Subject: [PATCH 16/52] s390/crypto: fix aes/paes Kconfig dependeny The s390_paes and the s390_aes kernel module used just one config symbol CONFIG_CRYPTO_AES. As paes has a dependency to PKEY and this requires ZCRYPT the aes module also had a dependency to the zcrypt device driver which is not true. Fixed by introducing a new config symbol CONFIG_CRYPTO_PAES which has dependencies to PKEY and ZCRYPT. Removed the dependency for the aes module to ZCRYPT. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/Makefile | 3 ++- drivers/crypto/Kconfig | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 678d9863e3f0..ad4bd777768d 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o -obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o paes_s390.o +obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o +obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o obj-$(CONFIG_S390_PRNG) += prng.o obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index fb1e60f5002e..9c7951bb05ac 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -89,6 +89,20 @@ config PKEY requires to have at least one CEX card in coprocessor mode available at runtime. +config CRYPTO_PAES_S390 + tristate "PAES cipher algorithms" + depends on S390 + depends on ZCRYPT + depends on PKEY + select CRYPTO_ALGAPI + select CRYPTO_BLKCIPHER + help + This is the s390 hardware accelerated implementation of the + AES cipher algorithms for use with protected key. + + Select this option if you want to use the paes cipher + for example to use protected key encrypted devices. + config CRYPTO_SHA1_S390 tristate "SHA1 digest algorithm" depends on S390 @@ -137,7 +151,6 @@ config CRYPTO_AES_S390 depends on S390 select CRYPTO_ALGAPI select CRYPTO_BLKCIPHER - select PKEY help This is the s390 hardware accelerated implementation of the AES cipher algorithms (FIPS-197). From 20d58cb642c640406f6466c64a0899400f52c47b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 8 May 2017 08:03:32 +0200 Subject: [PATCH 17/52] s390/hvc_iucv: fix broken Kconfig select statement Select statements in Kconfig do not necessarily enable all required dependencies and can lead to broken configs. This is also the case for the "select IUCV" statement within HVC_IUCV: warning: (HVC_IUCV) selects IUCV which has unmet direct dependencies (NET && S390) Just add the missing "depends on NET" to avoid broken configs. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/tty/hvc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index 574da15fe618..b8d5ea0ae26b 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -44,7 +44,7 @@ config HVC_RTAS config HVC_IUCV bool "z/VM IUCV Hypervisor console support (VM only)" - depends on S390 + depends on S390 && NET select HVC_DRIVER select IUCV default y From ac994e80f94f440138774830f2edf148d6ece1f3 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 12 May 2017 16:35:14 +0200 Subject: [PATCH 18/52] s390/zcrypt: Rework ap init in case of out of range domain param. When a out of range domain parameter was given, the init function returned with -EINVAL and the driver was not operational. As the driver is statically build into the kernel and is able to work with multiple domains anyway the init function should continue. Now the user has a chance to write a new default domain value via sysfs attribute file. Also added two new dbf debug messages related to the domain value handling. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/ap_bus.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index ea099910b4e9..6dee598979e7 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -766,7 +766,7 @@ static ssize_t ap_domain_store(struct bus_type *bus, ap_domain_index = domain; spin_unlock_bh(&ap_domain_lock); - AP_DBF(DBF_DEBUG, "store new default domain=%d\n", domain); + AP_DBF(DBF_DEBUG, "stored new default domain=%d\n", domain); return count; } @@ -952,6 +952,7 @@ static int ap_select_domain(void) } if (best_domain >= 0){ ap_domain_index = best_domain; + AP_DBF(DBF_DEBUG, "new ap_domain_index=%d\n", ap_domain_index); spin_unlock_bh(&ap_domain_lock); return 0; } @@ -988,7 +989,7 @@ static void ap_scan_bus(struct work_struct *unused) ap_qid_t qid; int depth = 0, type = 0; unsigned int functions = 0; - int rc, id, dom, borked, domains; + int rc, id, dom, borked, domains, defdomdevs = 0; AP_DBF(DBF_DEBUG, "ap_scan_bus running\n"); @@ -1052,6 +1053,8 @@ static void ap_scan_bus(struct work_struct *unused) put_device(dev); if (!borked) { domains++; + if (dom == ap_domain_index) + defdomdevs++; continue; } } @@ -1098,6 +1101,8 @@ static void ap_scan_bus(struct work_struct *unused) continue; } domains++; + if (dom == ap_domain_index) + defdomdevs++; } /* end domain loop */ if (ac) { /* remove card dev if there are no queue devices */ @@ -1106,6 +1111,11 @@ static void ap_scan_bus(struct work_struct *unused) put_device(&ac->ap_dev.device); } } /* end device loop */ + + if (defdomdevs < 1) + AP_DBF(DBF_INFO, "no queue device with default domain %d available\n", + ap_domain_index); + out: mod_timer(&ap_config_timer, jiffies + ap_config_time * HZ); } @@ -1174,14 +1184,14 @@ int __init ap_module_init(void) ap_init_configuration(); if (ap_configuration) - max_domain_id = ap_max_domain_id ? : (AP_DOMAINS - 1); + max_domain_id = + ap_max_domain_id ? ap_max_domain_id : AP_DOMAINS - 1; else max_domain_id = 15; if (ap_domain_index < -1 || ap_domain_index > max_domain_id) { pr_warn("%d is not a valid cryptographic domain\n", ap_domain_index); - rc = -EINVAL; - goto out_free; + ap_domain_index = -1; } /* In resume callback we need to know if the user had set the domain. * If so, we can not just reset it. @@ -1254,7 +1264,6 @@ out: unregister_reset_call(&ap_reset_call); if (ap_using_interrupts()) unregister_adapter_interrupt(&ap_airq); -out_free: kfree(ap_configuration); return rc; } From 9cf8edb7a33b7689d4c3429b1d339d29e7c3db86 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 18 May 2017 13:04:16 +0200 Subject: [PATCH 19/52] s390/smp: fix false positive kmemleak of mcesa data structure I get number of CPUs - 1 kmemleak hits like unreferenced object 0x37ec6f000 (size 1024): comm "swapper/0", pid 1, jiffies 4294937330 (age 889.690s) hex dump (first 32 bytes): 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk backtrace: [<000000000034a848>] kmem_cache_alloc+0x2b8/0x3d0 [<00000000001164de>] __cpu_up+0x456/0x488 [<000000000016f60c>] bringup_cpu+0x4c/0xd0 [<000000000016d5d2>] cpuhp_invoke_callback+0xe2/0x9e8 [<000000000016f3c6>] cpuhp_up_callbacks+0x5e/0x110 [<000000000016f988>] _cpu_up+0xe0/0x158 [<000000000016faf0>] do_cpu_up+0xf0/0x110 [<0000000000dae1ee>] smp_init+0x126/0x130 [<0000000000d9bd04>] kernel_init_freeable+0x174/0x2e0 [<000000000089fc62>] kernel_init+0x2a/0x148 [<00000000008adce2>] kernel_thread_starter+0x6/0xc [<00000000008adcdc>] kernel_thread_starter+0x0/0xc [] 0xffffffffffffffff The pointer of this data structure is stored in the prefix page of that CPU together with some extra bits ORed into the the low bits. Mark the data structure as non-leak. Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 363000a77ffc..1020a11a24e5 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -207,6 +208,8 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) kmem_cache_alloc(pcpu_mcesa_cache, GFP_KERNEL); if (!mcesa_origin) goto out; + /* The pointer is stored with mcesa_bits ORed in */ + kmemleak_not_leak((void *) mcesa_origin); mcesa_bits = MACHINE_HAS_GS ? 11 : 0; } } else { From bf10b6687c19484cdf28107ec9b4be51c06c7746 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 19 May 2017 09:07:23 +0200 Subject: [PATCH 20/52] s390/smp: use sigp condition code define Use proper define instead of open-coding the condition code value. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sigp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h index 72df5f2de6b0..020a8814d511 100644 --- a/arch/s390/include/asm/sigp.h +++ b/arch/s390/include/asm/sigp.h @@ -59,7 +59,7 @@ static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm, int cc; cc = ____pcpu_sigp(addr, order, parm, &_status); - if (status && cc == 1) + if (status && cc == SIGP_CC_STATUS_STORED) *status = _status; return cc; } From dbed23dba0fb153c4566af59da7dfa06b780b0af Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 12 May 2017 18:53:41 +0200 Subject: [PATCH 21/52] s390/zcrypt: Add some debug messages on failure. Added some dbf debug messages on failure of the most important ioctl calls. These messages are only enabled with dbf level 6 (debug) and so do not affect the normal operating mode which uses level 3 (errors and higher). Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_api.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 93015f85d4a6..b1c27e28859b 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -821,8 +821,10 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, do { rc = zcrypt_rsa_modexpo(&mex); } while (rc == -EAGAIN); - if (rc) + if (rc) { + ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d", rc); return rc; + } return put_user(mex.outputdatalength, &umex->outputdatalength); } case ICARSACRT: { @@ -838,8 +840,10 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, do { rc = zcrypt_rsa_crt(&crt); } while (rc == -EAGAIN); - if (rc) + if (rc) { + ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d", rc); return rc; + } return put_user(crt.outputdatalength, &ucrt->outputdatalength); } case ZSECSENDCPRB: { @@ -855,6 +859,8 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, do { rc = zcrypt_send_cprb(&xcRB); } while (rc == -EAGAIN); + if (rc) + ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d", rc); if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB))) return -EFAULT; return rc; @@ -872,6 +878,8 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, do { rc = zcrypt_send_ep11_cprb(&xcrb); } while (rc == -EAGAIN); + if (rc) + ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d", rc); if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb))) return -EFAULT; return rc; From fe7b274729fc0bab9b4238f875695d36726a6b10 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 May 2017 13:16:00 +0200 Subject: [PATCH 22/52] s390/fault: use _ASCE_ORIGIN instead of PAGE_MASK When masking an ASCE to get its origin use the corresponding define instead of the unrelated PAGE_MASK. This doesn't fix a bug since both masks are identical. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 5845d3028ffc..14f25798b001 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -130,7 +130,7 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & PAGE_MASK); + unsigned long *table = __va(asce & _ASCE_ORIGIN); pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { From 8aa8680aa383bf6e2ac2a7d3369097268c75f7da Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 22 May 2017 17:20:53 +0200 Subject: [PATCH 23/52] s390: Remove 'message security assist' from the list of vital facilities The code in arch/s390/crypto checks for the availability of the 'message security assist' facility on its own, either by using module_cpu_feature_match(MSA, ...) or by checking the facility bit during cpacf_query(). Thus setting the MSA facility bit in gen_facilities.c as hard requirement is not necessary. We can remove it here, so that the kernel can also run on systems that do not provide the MSA facility yet (like the emulated environment of QEMU, for example). Signed-off-by: Thomas Huth Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/tools/gen_facilities.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index be63fbd699fd..cfe3f76bc2de 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -35,7 +35,6 @@ static struct facility_def facility_defs[] = { #endif #ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES 7, /* stfle */ - 17, /* message security assist */ 21, /* extended-immediate facility */ 25, /* store clock fast */ #endif From 53d7f25f09eb0ea7cb119d86590f3992656a6892 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 May 2017 09:17:24 +0200 Subject: [PATCH 24/52] s390/facilities: remove stfle requirement All call sites of "stfle" check if the instruction is available before executing it. Therefore there is no reason to have the corresponding facility bit set within the architecture level set. This removes the last more or less odd bit from the list. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/tools/gen_facilities.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index cfe3f76bc2de..025ea20fc4b4 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -34,7 +34,6 @@ static struct facility_def facility_defs[] = { 18, /* long displacement facility */ #endif #ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - 7, /* stfle */ 21, /* extended-immediate facility */ 25, /* store clock fast */ #endif From f5bbd7219891364a6ba1231cb751905045fd4466 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 19 May 2017 15:32:09 +0200 Subject: [PATCH 25/52] s390/ptrace: guarded storage regset for the current task The regset functions for guarded storage are supposed to work on the current task as well. For task == current add the required load and store instructions for the guarded storage control block. Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ptrace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 488c5bb8dc77..252ed61a128b 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -1160,6 +1160,8 @@ static int s390_gs_cb_get(struct task_struct *target, return -ENODEV; if (!data) return -ENODATA; + if (target == current) + save_gs_cb(data); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, sizeof(struct gs_cb)); } @@ -1170,6 +1172,7 @@ static int s390_gs_cb_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { struct gs_cb *data = target->thread.gs_cb; + int rc; if (!MACHINE_HAS_GS) return -ENODEV; @@ -1177,10 +1180,18 @@ static int s390_gs_cb_set(struct task_struct *target, data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; + data->gsd = 25; target->thread.gs_cb = data; + if (target == current) + __ctl_set_bit(2, 4); + } else if (target == current) { + save_gs_cb(data); } - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - data, 0, sizeof(struct gs_cb)); + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + data, 0, sizeof(struct gs_cb)); + if (target == current) + restore_gs_cb(data); + return rc; } static int s390_gs_bc_get(struct task_struct *target, From f96c6f72bc792e7add27e40ec859a8f5ea72e72b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 May 2017 13:27:34 +0200 Subject: [PATCH 26/52] s390/mm: remove incorrect _REGION3_ENTRY_ORIGIN define _REGION3_ENTRY_ORIGIN defines a wrong mask which can be used to extract a segment table origin from a region 3 table entry. It removes only the lower 11 instead of 12 bits from a region 3 table entry. Luckily this bit is currently always zero, so nothing bad happened yet. In order to avoid future bugs just remove the region 3 specific mask and use the correct generic _REGION_ENTRY_ORIGIN mask. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 3effb26f0e1a..6f8a67a103be 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -301,8 +301,6 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ -#define _REGION3_ENTRY_ORIGIN ~0x7ffUL/* region third table origin */ - #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ @@ -641,7 +639,7 @@ static inline unsigned long pud_pfn(pud_t pud) { unsigned long origin_mask; - origin_mask = _REGION3_ENTRY_ORIGIN; + origin_mask = _REGION_ENTRY_ORIGIN; if (pud_large(pud)) origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; return (pud_val(pud) & origin_mask) >> PAGE_SHIFT; From cc18b460dc72fc9020edcd617b3c4b23a577ee19 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 20 May 2017 11:43:26 +0200 Subject: [PATCH 27/52] s390/mm: add p?d_folded() helper functions Introduce and use p?d_folded() functions to clarify the page table code a bit more. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6f8a67a103be..57057fb1cc07 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -563,18 +563,23 @@ static inline void crdte(unsigned long old, unsigned long new, } /* - * pgd/pmd/pte query functions + * pgd/p4d/pud/pmd/pte query functions */ +static inline int pgd_folded(pgd_t pgd) +{ + return (pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1; +} + static inline int pgd_present(pgd_t pgd) { - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) + if (pgd_folded(pgd)) return 1; return (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) != 0UL; } static inline int pgd_none(pgd_t pgd) { - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) + if (pgd_folded(pgd)) return 0; return (pgd_val(pgd) & _REGION_ENTRY_INVALID) != 0UL; } @@ -592,16 +597,21 @@ static inline int pgd_bad(pgd_t pgd) return (pgd_val(pgd) & mask) != 0; } +static inline int p4d_folded(p4d_t p4d) +{ + return (p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2; +} + static inline int p4d_present(p4d_t p4d) { - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + if (p4d_folded(p4d)) return 1; return (p4d_val(p4d) & _REGION_ENTRY_ORIGIN) != 0UL; } static inline int p4d_none(p4d_t p4d) { - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) + if (p4d_folded(p4d)) return 0; return p4d_val(p4d) == _REGION2_ENTRY_EMPTY; } @@ -614,16 +624,21 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & origin_mask) >> PAGE_SHIFT; } +static inline int pud_folded(pud_t pud) +{ + return (pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3; +} + static inline int pud_present(pud_t pud) { - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) + if (pud_folded(pud)) return 1; return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL; } static inline int pud_none(pud_t pud) { - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) + if (pud_folded(pud)) return 0; return pud_val(pud) == _REGION3_ENTRY_EMPTY; } From c39457ff1fd58bf5cc466025b1467de29ba0437a Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 23 May 2017 10:49:35 +0200 Subject: [PATCH 28/52] s390/perf: fix null string in perf list pmu command Command 'perf list pmu' displays events which contain an invalid string "(null)=xxx", where xxx is the pmu event name, for example: cpum_cf/AES_BLOCKED_CYCLES,(null)=AES_BLOCKED_CYCLES/ This is not correct, the invalid string should not be displayed at all. It is caused by an obsolete term in the sysfs attribute file for each s390 CPUMF counter event. Reading from the sysfs file also displays the event name. Fix this by omitting the event name. This patch makes s390 CPUMF sysfs files consistent with other plattforms. This is an interface change between user and kernel but does not break anything. Reading from a counter event sysfs file should only list terms mentioned in the /sys/bus/event_source/devices//format directory. Name is not listed. Reported-by: Zvonko Kosic Signed-off-by: Thomas Richter Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 955a7b6fa0a4..93a386f4a3b5 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -245,6 +245,5 @@ ssize_t cpumf_events_sysfs_show(struct device *dev, struct perf_pmu_events_attr *pmu_attr; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%04llx,name=%s\n", - pmu_attr->id, attr->attr.name); + return sprintf(page, "event=0x%04llx\n", pmu_attr->id); } From 7e9710af2301e76421a6a8f2655937651279f5c4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 29 May 2017 13:13:08 -0600 Subject: [PATCH 29/52] s390: provide default ioremap and iounmap declaration Move the CONFIG_PCI device so that ioremap and iounmap are always available. This looks safe as there's nothing PCI specific in the implementation of these functions. I have designs to use these functions in scatterlist.c where they'd likely never be called without CONFIG_PCI set, but this is needed to compile such changes. Signed-off-by: Logan Gunthorpe Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Sebastian Ott Cc: Al Viro Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 437e9af96688..904e4b3af95d 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -25,8 +25,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -#ifdef CONFIG_PCI - #define ioremap_nocache(addr, size) ioremap(addr, size) #define ioremap_wc ioremap_nocache #define ioremap_wt ioremap_nocache @@ -49,6 +47,8 @@ static inline void ioport_unmap(void __iomem *p) { } +#ifdef CONFIG_PCI + /* * s390 needs a private implementation of pci_iomap since ioremap with its * offset parameter isn't sufficient. That's because BAR spaces are not From b487a914f853545842a0899329b6b72fe56c4081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Tue, 23 May 2017 16:17:30 +0200 Subject: [PATCH 30/52] s390/dasd: Display read-only attribute correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have two flags, DASD_FLAG_DEVICE_RO and DASD_FEATURE_READONLY, that tell us whether a device is read-only. DASD_FLAG_DEVICE_RO is set when a device is attached as read-only to z/VM and DASD_FEATURE_READONLY is set when either the corresponding kernel parameter is configured, or the read-only state is changed via sysfs. This is valuable information in any case. However, only the feature flag is being checked at the moment when we display the current state. Fix this by checking both flags. Reviewed-by: Stefan Haberland Signed-off-by: Jan Höppner Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_devmap.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 05e5762d045e..0ce84f0a4d7f 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -748,13 +748,22 @@ static ssize_t dasd_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dasd_devmap *devmap; - int ro_flag; + struct dasd_device *device; + int ro_flag = 0; devmap = dasd_find_busid(dev_name(dev)); - if (!IS_ERR(devmap)) - ro_flag = (devmap->features & DASD_FEATURE_READONLY) != 0; - else - ro_flag = (DASD_FEATURE_DEFAULT & DASD_FEATURE_READONLY) != 0; + if (IS_ERR(devmap)) + goto out; + + ro_flag = !!(devmap->features & DASD_FEATURE_READONLY); + + spin_lock(&dasd_devmap_lock); + device = devmap->device; + if (device) + ro_flag |= test_bit(DASD_FLAG_DEVICE_RO, &device->flags); + spin_unlock(&dasd_devmap_lock); + +out: return snprintf(buf, PAGE_SIZE, ro_flag ? "1\n" : "0\n"); } From 2757fe1d8ebd0e6ab1dbf1105978b8c8369dcc49 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 16 May 2017 10:30:13 +0200 Subject: [PATCH 31/52] s390/dasd: fix unusable device after safe offline processing The safe offline processing needs, as well as the normal offline processing, to be locked against multiple parallel executions. But it should be able to be overtaken by a normal offline processing to make sure that the device does not wait forever for outstanding I/O if the user wants to. Unfortunately the parallel processing of safe offline and normal offline might lead to a race situation where both threads report successful execution to the CIO layer which in turn tries to deregister the kobject of the device twice. This leads to a refcount_t: underflow; use-after-free. error and the device is not able to be set online again afterwards without a reboot. Correct the locking of the safe offline processing by doing the following: - Use the cdev lock to secure all set and test operations to the device flags. - Two safe offline processes are locked against each other using the DASD_FLAG_SAFE_OFFLINE and DASD_FLAG_SAFE_OFFLINE_RUNNING device flags. The differentiation between offline triggered and offline running is needed since the normal offline attribute is owned by CIO and we have to pass over control in between. - The dasd_generic_set_offline process handles the offline processing. It is locked against parallel execution using the DASD_FLAG_OFFLINE. - Only a running safe offline should be able to be overtaken by a single normal offline. This is ensured by clearing the DASD_FLAG_SAFE_OFFLINE_RUNNING flag when a normal offline overtakes. So this can only happen ones. - The safe offline just aborts in this case doing nothing and the normal offline processing finishes as usual. Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 68 ++++++++++++++++++-------------- drivers/s390/block/dasd_devmap.c | 7 +++- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 6fb3fd5efc11..b0c65dcb6865 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3562,57 +3562,69 @@ int dasd_generic_set_offline(struct ccw_device *cdev) else pr_warn("%s: The DASD cannot be set offline while it is in use\n", dev_name(&cdev->dev)); - clear_bit(DASD_FLAG_OFFLINE, &device->flags); - goto out_busy; + rc = -EBUSY; + goto out_err; } } - if (test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { - /* - * safe offline already running - * could only be called by normal offline so safe_offline flag - * needs to be removed to run normal offline and kill all I/O - */ - if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags)) - /* Already doing normal offline processing */ - goto out_busy; - else - clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags); - } else { - if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) - /* Already doing offline processing */ - goto out_busy; + /* + * Test if the offline processing is already running and exit if so. + * If a safe offline is being processed this could only be a normal + * offline that should be able to overtake the safe offline and + * cancel any I/O we do not want to wait for any longer + */ + if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) { + if (test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { + clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, + &device->flags); + } else { + rc = -EBUSY; + goto out_err; + } } - set_bit(DASD_FLAG_OFFLINE, &device->flags); - spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); /* - * if safe_offline called set safe_offline_running flag and + * if safe_offline is called set safe_offline_running flag and * clear safe_offline so that a call to normal offline * can overrun safe_offline processing */ if (test_and_clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags) && !test_and_set_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { + /* need to unlock here to wait for outstanding I/O */ + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); /* * If we want to set the device safe offline all IO operations * should be finished before continuing the offline process * so sync bdev first and then wait for our queues to become * empty */ - /* sync blockdev and partitions */ if (device->block) { rc = fsync_bdev(device->block->bdev); if (rc != 0) goto interrupted; } - /* schedule device tasklet and wait for completion */ dasd_schedule_device_bh(device); rc = wait_event_interruptible(shutdown_waitq, _wait_for_empty_queues(device)); if (rc != 0) goto interrupted; + + /* + * check if a normal offline process overtook the offline + * processing in this case simply do nothing beside returning + * that we got interrupted + * otherwise mark safe offline as not running any longer and + * continue with normal offline + */ + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); + if (!test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { + rc = -ERESTARTSYS; + goto out_err; + } + clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags); } + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); dasd_set_target_state(device, DASD_STATE_NEW); /* dasd_delete_device destroys the device reference. */ @@ -3624,22 +3636,18 @@ int dasd_generic_set_offline(struct ccw_device *cdev) */ if (block) dasd_free_block(block); + return 0; interrupted: /* interrupted by signal */ - clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags); + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags); clear_bit(DASD_FLAG_OFFLINE, &device->flags); - dasd_put_device(device); - - return rc; - -out_busy: +out_err: dasd_put_device(device); spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); - - return -EBUSY; + return rc; } EXPORT_SYMBOL_GPL(dasd_generic_set_offline); diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 0ce84f0a4d7f..e943d9c48926 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -950,11 +950,14 @@ dasd_safe_offline_store(struct device *dev, struct device_attribute *attr, { struct ccw_device *cdev = to_ccwdev(dev); struct dasd_device *device; + unsigned long flags; int rc; - device = dasd_device_from_cdev(cdev); + spin_lock_irqsave(get_ccwdev_lock(cdev), flags); + device = dasd_device_from_cdev_locked(cdev); if (IS_ERR(device)) { rc = PTR_ERR(device); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); goto out; } @@ -962,12 +965,14 @@ dasd_safe_offline_store(struct device *dev, struct device_attribute *attr, test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { /* Already doing offline processing */ dasd_put_device(device); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); rc = -EBUSY; goto out; } set_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags); dasd_put_device(device); + spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); rc = ccw_device_set_offline(cdev); From e8ac01555d9e464249e8bb122337d6d6e5589ccc Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 18 May 2017 13:24:45 +0200 Subject: [PATCH 32/52] s390/dasd: fix hanging safe offline The safe offline processing may hang forever because it waits for I/O which can not be started because of the offline flag that prevents new I/O from being started. Allow I/O to be started during safe offline processing because in this special case we take care that the queues are empty before throwing away the device. Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index b0c65dcb6865..c72ac57940f4 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1965,8 +1965,12 @@ static int __dasd_device_is_unusable(struct dasd_device *device, { int mask = ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM); - if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) { - /* dasd is being set offline. */ + if (test_bit(DASD_FLAG_OFFLINE, &device->flags) && + !test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { + /* + * dasd is being set offline + * but it is no safe offline where we have to allow I/O + */ return 1; } if (device->stopped) { From 2b7b9817c2dbfce0501a313db817718fc5ef6524 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Jun 2017 10:04:26 +0200 Subject: [PATCH 33/52] s390/dumpstack: remove raw stack dump Remove raw stack dumps that are printed before call traces in case of a warning, or the 'l' sysrq trigger (show a stack backtrace for all active CPUs). Besides that a raw stack dump should not be shown for the 'l' sysrq trigger the value of the dump is close to zero. That's also why we don't print it in case of a panic since ages anymore. That this is still printed on warnings is just a leftover. So get rid of this completely. The following won't be printed anymore with this change: Stack: 00000000bbc4fbc8 00000000bbc4fc58 0000000000000003 0000000000000000 00000000bbc4fcf8 00000000bbc4fc70 00000000bbc4fc70 0000000000000020 000000007fe00098 00000000bfe8be00 00000000bbc4fe94 000000000000000a 000000000000000c 00000000bbc4fcc0 0000000000000000 0000000000000000 000000000095b930 0000000000113366 00000000bbc4fc58 00000000bbc4fca0 Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dumpstack.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 829e1c53005c..62904890d127 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -98,8 +98,10 @@ static int show_address(void *data, unsigned long address, int reliable) return 0; } -static void show_trace(struct task_struct *task, unsigned long sp) +void show_stack(struct task_struct *task, unsigned long *stack) { + unsigned long sp = (unsigned long) stack; + if (!sp) sp = task ? task->thread.ksp : current_stack_pointer(); printk("Call Trace:\n"); @@ -109,29 +111,6 @@ static void show_trace(struct task_struct *task, unsigned long sp) debug_show_held_locks(task); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - unsigned long *stack; - int i; - - stack = sp; - if (!stack) { - if (!task) - stack = (unsigned long *)current_stack_pointer(); - else - stack = (unsigned long *)task->thread.ksp; - } - printk(KERN_DEFAULT "Stack:\n"); - for (i = 0; i < 20; i++) { - if (((addr_t) stack & (THREAD_SIZE-1)) == 0) - break; - if (i % 4 == 0) - printk(KERN_DEFAULT " "); - pr_cont("%016lx%c", *stack++, i % 4 == 3 ? '\n' : ' '); - } - show_trace(task, (unsigned long)sp); -} - static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); @@ -169,7 +148,7 @@ void show_regs(struct pt_regs *regs) show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ if (!user_mode(regs)) - show_trace(NULL, regs->gprs[15]); + show_stack(NULL, (unsigned long *) regs->gprs[15]); show_last_breaking_event(regs); } From ead1dec8ed0497c4a3f7fc36135899e5f26563ab Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Jun 2017 13:55:42 +0200 Subject: [PATCH 34/52] s390/ipl: revert Load Normal semantics for LPAR CCW-type re-IPL This reverts the two commits 7afbeb6df2aa ("s390/ipl: always use load normal for CCW-type re-IPL") 0f7451ff3ab8 ("s390/ipl: use load normal for LPAR re-ipl") The two commits did not take into account that behavior of standby memory changes fundamentally if the re-IPL method is changed from Load Clear to Load Normal. In case of the old re-IPL clear method all memory that was initially in standby state will be put into standby state again within the re-IPL process. Or in other words: memory that was brought online before a re-IPL will be offline again after a reboot. Given that we use different re-IPL methods depending on the hypervisor and CCW-type vs SCSI re-IPL it is not easy to tell in advance when and why memory will stay online or will be offline after a re-IPL. This does also have other side effects, since memory that is online from the beginning will be in ZONE_NORMAL by default vs ZONE_MOVABLE for memory that is offline. Therefore, before the change, a user could online and offline memory easily since standby memory was always in ZONE_NORMAL. After the change, and a re-IPL, this depended on which memory parts were online before the re-IPL. From a usability point of view the current behavior is more than suboptimal. Therefore revert these changes until we have a better solution and get back to a consistent behavior. The bad thing about this is that the time required for a re-IPL will be significantly increased for configurations with several 100GB or 1TB of memory. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ipl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index e545ffe5155a..8e622bb52f7a 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -564,8 +564,6 @@ static struct kset *ipl_kset; static void __ipl_run(void *unused) { - if (MACHINE_IS_LPAR && ipl_info.type == IPL_TYPE_CCW) - diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); diag308(DIAG308_LOAD_CLEAR, NULL); if (MACHINE_IS_VM) __cpcmd("IPL", NULL, 0, NULL); @@ -1088,10 +1086,7 @@ static void __reipl_run(void *unused) break; case REIPL_METHOD_CCW_DIAG: diag308(DIAG308_SET, reipl_block_ccw); - if (MACHINE_IS_LPAR) - diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); - else - diag308(DIAG308_LOAD_CLEAR, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_FCP_RW_DIAG: diag308(DIAG308_SET, reipl_block_fcp); From 36f6237ebf4dfdf62813540e962d53584ba8b271 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 15 May 2017 15:49:07 +0200 Subject: [PATCH 35/52] s390/cio: introduce io_subchannel_type The sysfs attributes implemented by the vfio_ccw driver are also implemented by the io_subchannel driver. Move these into a device_type which is set by the css bus. Signed-off-by: Sebastian Ott Reviewed-by: Dong Jia Shi Reviewed-by: Cornelia Huck Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/css.c | 49 ++++++++++++++++++++++++++++ drivers/s390/cio/device.c | 42 ------------------------ drivers/s390/cio/vfio_ccw_drv.c | 58 +-------------------------------- 3 files changed, 50 insertions(+), 99 deletions(-) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index e2aa944eb566..d3e504c3c362 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -296,6 +296,51 @@ static const struct attribute_group *default_subch_attr_groups[] = { NULL, }; +static ssize_t chpids_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + struct chsc_ssd_info *ssd = &sch->ssd_info; + ssize_t ret = 0; + int mask; + int chp; + + for (chp = 0; chp < 8; chp++) { + mask = 0x80 >> chp; + if (ssd->path_mask & mask) + ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id); + else + ret += sprintf(buf + ret, "00 "); + } + ret += sprintf(buf + ret, "\n"); + return ret; +} +static DEVICE_ATTR(chpids, 0444, chpids_show, NULL); + +static ssize_t pimpampom_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + struct pmcw *pmcw = &sch->schib.pmcw; + + return sprintf(buf, "%02x %02x %02x\n", + pmcw->pim, pmcw->pam, pmcw->pom); +} +static DEVICE_ATTR(pimpampom, 0444, pimpampom_show, NULL); + +static struct attribute *io_subchannel_type_attrs[] = { + &dev_attr_chpids.attr, + &dev_attr_pimpampom.attr, + NULL, +}; +ATTRIBUTE_GROUPS(io_subchannel_type); + +static const struct device_type io_subchannel_type = { + .groups = io_subchannel_type_groups, +}; + int css_register_subchannel(struct subchannel *sch) { int ret; @@ -304,6 +349,10 @@ int css_register_subchannel(struct subchannel *sch) sch->dev.parent = &channel_subsystems[0]->device; sch->dev.bus = &css_bus_type; sch->dev.groups = default_subch_attr_groups; + + if (sch->st == SUBCHANNEL_TYPE_IO) + sch->dev.type = &io_subchannel_type; + /* * We don't want to generate uevents for I/O subchannels that don't * have a working ccw device behind them since they will be diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index b8006ea9099c..7be01a58b44f 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -208,44 +208,6 @@ int __init io_subchannel_init(void) /************************ device handling **************************/ -/* - * A ccw_device has some interfaces in sysfs in addition to the - * standard ones. - * The following entries are designed to export the information which - * resided in 2.4 in /proc/subchannels. Subchannel and device number - * are obvious, so they don't have an entry :) - * TODO: Split chpids and pimpampom up? Where is "in use" in the tree? - */ -static ssize_t -chpids_show (struct device * dev, struct device_attribute *attr, char * buf) -{ - struct subchannel *sch = to_subchannel(dev); - struct chsc_ssd_info *ssd = &sch->ssd_info; - ssize_t ret = 0; - int chp; - int mask; - - for (chp = 0; chp < 8; chp++) { - mask = 0x80 >> chp; - if (ssd->path_mask & mask) - ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id); - else - ret += sprintf(buf + ret, "00 "); - } - ret += sprintf (buf+ret, "\n"); - return min((ssize_t)PAGE_SIZE, ret); -} - -static ssize_t -pimpampom_show (struct device * dev, struct device_attribute *attr, char * buf) -{ - struct subchannel *sch = to_subchannel(dev); - struct pmcw *pmcw = &sch->schib.pmcw; - - return sprintf (buf, "%02x %02x %02x\n", - pmcw->pim, pmcw->pam, pmcw->pom); -} - static ssize_t devtype_show (struct device *dev, struct device_attribute *attr, char *buf) { @@ -636,8 +598,6 @@ static ssize_t vpm_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%02x\n", sch->vpm); } -static DEVICE_ATTR(chpids, 0444, chpids_show, NULL); -static DEVICE_ATTR(pimpampom, 0444, pimpampom_show, NULL); static DEVICE_ATTR(devtype, 0444, devtype_show, NULL); static DEVICE_ATTR(cutype, 0444, cutype_show, NULL); static DEVICE_ATTR(modalias, 0444, modalias_show, NULL); @@ -647,8 +607,6 @@ static DEVICE_ATTR(logging, 0200, NULL, initiate_logging); static DEVICE_ATTR(vpm, 0444, vpm_show, NULL); static struct attribute *io_subchannel_attrs[] = { - &dev_attr_chpids.attr, - &dev_attr_pimpampom.attr, &dev_attr_logging.attr, &dev_attr_vpm.attr, NULL, diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e90dd43d2a55..a25367ebaa89 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -89,54 +89,6 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work) private->state = VFIO_CCW_STATE_IDLE; } -/* - * Sysfs interfaces - */ -static ssize_t chpids_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct subchannel *sch = to_subchannel(dev); - struct chsc_ssd_info *ssd = &sch->ssd_info; - ssize_t ret = 0; - int chp; - int mask; - - for (chp = 0; chp < 8; chp++) { - mask = 0x80 >> chp; - if (ssd->path_mask & mask) - ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id); - else - ret += sprintf(buf + ret, "00 "); - } - ret += sprintf(buf+ret, "\n"); - return ret; -} - -static ssize_t pimpampom_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct subchannel *sch = to_subchannel(dev); - struct pmcw *pmcw = &sch->schib.pmcw; - - return sprintf(buf, "%02x %02x %02x\n", - pmcw->pim, pmcw->pam, pmcw->pom); -} - -static DEVICE_ATTR(chpids, 0444, chpids_show, NULL); -static DEVICE_ATTR(pimpampom, 0444, pimpampom_show, NULL); - -static struct attribute *vfio_subchannel_attrs[] = { - &dev_attr_chpids.attr, - &dev_attr_pimpampom.attr, - NULL, -}; - -static struct attribute_group vfio_subchannel_attr_group = { - .attrs = vfio_subchannel_attrs, -}; - /* * Css driver callbacks */ @@ -174,13 +126,9 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (ret) goto out_free; - ret = sysfs_create_group(&sch->dev.kobj, &vfio_subchannel_attr_group); - if (ret) - goto out_disable; - ret = vfio_ccw_mdev_reg(sch); if (ret) - goto out_rm_group; + goto out_disable; INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); atomic_set(&private->avail, 1); @@ -188,8 +136,6 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return 0; -out_rm_group: - sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group); out_disable: cio_disable_subchannel(sch); out_free: @@ -206,8 +152,6 @@ static int vfio_ccw_sch_remove(struct subchannel *sch) vfio_ccw_mdev_unreg(sch); - sysfs_remove_group(&sch->dev.kobj, &vfio_subchannel_attr_group); - dev_set_drvdata(&sch->dev, NULL); kfree(private); From 60c497014e34af5aa0be56d0869c67fa2b5c3786 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 1 Jun 2017 11:04:04 +0200 Subject: [PATCH 36/52] s390/mm: use correct address space when enabling DAT Right now the kernel uses the primary address space until finally the switch to the correct home address space will be done when the idle PSW will be loaded within psw_idle(). Correct this and simply use the home address space when DAT is enabled for the first time. This doesn't really fix a bug, but fixes odd behavior. Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/init.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ee6a1d3d4983..0352f9f88c73 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -81,6 +81,7 @@ void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; unsigned long pgd_type, asce_bits; + psw_t psw; init_mm.pgd = swapper_pg_dir; if (VMALLOC_END > (1UL << 42)) { @@ -100,7 +101,10 @@ void __init paging_init(void) __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); __ctl_load(S390_lowcore.kernel_asce, 13, 13); - __arch_local_irq_stosm(0x04); + psw.mask = __extract_psw(); + psw_bits(psw).t = 1; + psw_bits(psw).as = PSW_AS_HOME; + __load_psw_mask(psw.mask); sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); From 8bb3fdd6863c3b6b84bbab750d6b35e889c1399d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 3 Jun 2017 10:19:55 +0200 Subject: [PATCH 37/52] s390: rename psw_bits enums The address space enums that must be used when modifying the address space part of a psw with the psw_bits() macro can easily be confused with the psw defines that are used to mask and compare directly the mask part of a psw. We have e.g. PSW_AS_PRIMARY vs PSW_ASC_PRIMARY. To avoid confusion rename the PSW_AS_* enums to PSW_BITS_AS_*. In addition also rename the PSW_AMODE_* enums, so they also follow the same naming scheme: PSW_BITS_AMODE_*. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ptrace.h | 14 +++++++------- arch/s390/kernel/uprobes.c | 8 ++++---- arch/s390/kvm/gaccess.c | 12 ++++++------ arch/s390/kvm/gaccess.h | 4 ++-- arch/s390/kvm/guestdbg.c | 6 +++--- arch/s390/kvm/priv.c | 6 +++--- arch/s390/mm/init.c | 2 +- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 99bc456cc26a..c8d13bcc9f5d 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -46,16 +46,16 @@ struct psw_bits { }; enum { - PSW_AMODE_24BIT = 0, - PSW_AMODE_31BIT = 1, - PSW_AMODE_64BIT = 3 + PSW_BITS_AMODE_24BIT = 0, + PSW_BITS_AMODE_31BIT = 1, + PSW_BITS_AMODE_64BIT = 3 }; enum { - PSW_AS_PRIMARY = 0, - PSW_AS_ACCREG = 1, - PSW_AS_SECONDARY = 2, - PSW_AS_HOME = 3 + PSW_BITS_AS_PRIMARY = 0, + PSW_BITS_AS_ACCREG = 1, + PSW_BITS_AS_SECONDARY = 2, + PSW_BITS_AS_HOME = 3 }; #define psw_bits(__psw) (*({ \ diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 314e0ee3016a..0eec45b4575b 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -27,9 +27,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (psw_bits(regs->psw).eaba == PSW_AMODE_24BIT) + if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) return -EINVAL; - if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_AMODE_31BIT) + if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; clear_pt_regs_flag(regs, PIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).r; @@ -372,8 +372,8 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if ((psw_bits(regs->psw).eaba == PSW_AMODE_24BIT) || - ((psw_bits(regs->psw).eaba == PSW_AMODE_31BIT) && + if ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) || + ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) && !is_compat_task())) { regs->psw.addr = __rewind_psw(regs->psw, UPROBE_SWBP_INSN_SIZE); do_report_trap(regs, SIGILL, ILL_ILLADR, NULL); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9da243d94cc3..6ad4a9797de8 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -557,20 +557,20 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, return 0; } - if (mode == GACC_IFETCH) - psw.as = psw.as == PSW_AS_HOME ? PSW_AS_HOME : PSW_AS_PRIMARY; + if ((mode == GACC_IFETCH) && (psw.as != PSW_BITS_AS_HOME)) + psw.as = PSW_BITS_AS_PRIMARY; switch (psw.as) { - case PSW_AS_PRIMARY: + case PSW_BITS_AS_PRIMARY: asce->val = vcpu->arch.sie_block->gcr[1]; return 0; - case PSW_AS_SECONDARY: + case PSW_BITS_AS_SECONDARY: asce->val = vcpu->arch.sie_block->gcr[7]; return 0; - case PSW_AS_HOME: + case PSW_BITS_AS_HOME: asce->val = vcpu->arch.sie_block->gcr[13]; return 0; - case PSW_AS_ACCREG: + case PSW_BITS_AS_ACCREG: rc = ar_translation(vcpu, asce, ar, mode); if (rc > 0) return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC); diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 7ce47fd36f28..bec42b852246 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -57,9 +57,9 @@ static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, { psw_t *psw = &vcpu->arch.sie_block->gpsw; - if (psw_bits(*psw).eaba == PSW_AMODE_64BIT) + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) return ga; - if (psw_bits(*psw).eaba == PSW_AMODE_31BIT) + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) return ga & ((1UL << 31) - 1); return ga & ((1UL << 24) - 1); } diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 23d9a4e12da1..c2e0ddc1356e 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -613,15 +613,15 @@ int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) * instruction. Check primary and home space-switch-event * controls. (theoretically home -> home produced no event) */ - if (((new_as == PSW_AS_HOME) ^ old_as_is_home(vcpu)) && - (pssec(vcpu) || hssec(vcpu))) + if (((new_as == PSW_BITS_AS_HOME) ^ old_as_is_home(vcpu)) && + (pssec(vcpu) || hssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; /* * PT, PTI, PR, PC instruction operate on primary AS only. Check * if the primary-space-switch-event control was or got set. */ - if (new_as == PSW_AS_PRIMARY && !old_as_is_home(vcpu) && + if (new_as == PSW_BITS_AS_PRIMARY && !old_as_is_home(vcpu) && (pssec(vcpu) || old_ssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c03106c428cf..e9dd7efc57b8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -361,7 +361,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) } } if (m3 & SSKE_MB) { - if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK; else vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL; @@ -901,7 +901,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) /* only support 2G frame size if EDAT2 is available and we are not in 24-bit addressing mode */ if (!test_kvm_facility(vcpu->kvm, 78) || - psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_24BIT) + psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); end = (start + (1UL << 31)) & ~((1UL << 31) - 1); break; @@ -938,7 +938,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) start += PAGE_SIZE; } if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { - if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) { vcpu->run->s.regs.gprs[reg2] = end; } else { vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 0352f9f88c73..bc8c301f82b6 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -103,7 +103,7 @@ void __init paging_init(void) __ctl_load(S390_lowcore.kernel_asce, 13, 13); psw.mask = __extract_psw(); psw_bits(psw).t = 1; - psw_bits(psw).as = PSW_AS_HOME; + psw_bits(psw).as = PSW_BITS_AS_HOME; __load_psw_mask(psw.mask); sparse_memory_present_with_active_regions(MAX_NUMNODES); From a752598254016d2f9b4415d43a6402fe083f70b2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 3 Jun 2017 10:56:07 +0200 Subject: [PATCH 38/52] s390: rename struct psw_bits members Rename a couple of the struct psw_bits members so it is more obvious for what they are good. Initially I thought using the single character names from the PoP would be sufficient and obvious, but admittedly that is not true. The current implementation is not easy to use, if one has to look into the source file to figure out which member represents the 'per' bit (which is the 'r' member). Therefore rename the members to sane names that are identical to the uapi psw mask defines: r -> per i -> io e -> ext t -> dat m -> mcheck w -> wait p -> pstate Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ptrace.h | 38 ++++++++++++++++----------------- arch/s390/kernel/dumpstack.c | 4 ++-- arch/s390/kernel/perf_cpum_sf.c | 10 ++++----- arch/s390/kernel/uprobes.c | 4 ++-- arch/s390/kvm/gaccess.c | 10 ++++----- arch/s390/kvm/priv.c | 2 +- arch/s390/mm/init.c | 2 +- 7 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index c8d13bcc9f5d..004f54909235 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -24,25 +24,25 @@ PSW_MASK_PSTATE | PSW_ASC_PRIMARY) struct psw_bits { - unsigned long : 1; - unsigned long r : 1; /* PER-Mask */ - unsigned long : 3; - unsigned long t : 1; /* DAT Mode */ - unsigned long i : 1; /* Input/Output Mask */ - unsigned long e : 1; /* External Mask */ - unsigned long key : 4; /* PSW Key */ - unsigned long : 1; - unsigned long m : 1; /* Machine-Check Mask */ - unsigned long w : 1; /* Wait State */ - unsigned long p : 1; /* Problem State */ - unsigned long as : 2; /* Address Space Control */ - unsigned long cc : 2; /* Condition Code */ - unsigned long pm : 4; /* Program Mask */ - unsigned long ri : 1; /* Runtime Instrumentation */ - unsigned long : 6; - unsigned long eaba : 2; /* Addressing Mode */ - unsigned long : 31; - unsigned long ia : 64; /* Instruction Address */ + unsigned long : 1; + unsigned long per : 1; /* PER-Mask */ + unsigned long : 3; + unsigned long dat : 1; /* DAT Mode */ + unsigned long io : 1; /* Input/Output Mask */ + unsigned long ext : 1; /* External Mask */ + unsigned long key : 4; /* PSW Key */ + unsigned long : 1; + unsigned long mcheck : 1; /* Machine-Check Mask */ + unsigned long wait : 1; /* Wait State */ + unsigned long pstate : 1; /* Problem State */ + unsigned long as : 2; /* Address Space Control */ + unsigned long cc : 2; /* Condition Code */ + unsigned long pm : 4; /* Program Mask */ + unsigned long ri : 1; /* Runtime Instrumentation */ + unsigned long : 6; + unsigned long eaba : 2; /* Addressing Mode */ + unsigned long : 31; + unsigned long ia : 64; /* Instruction Address */ }; enum { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 62904890d127..dab78babfab6 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -128,8 +128,8 @@ void show_registers(struct pt_regs *regs) pr_cont(" (%pSR)", (void *)regs->psw.addr); pr_cont("\n"); printk(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x " - "P:%x AS:%x CC:%x PM:%x", psw->r, psw->t, psw->i, psw->e, - psw->key, psw->m, psw->w, psw->p, psw->as, psw->cc, psw->pm); + "P:%x AS:%x CC:%x PM:%x", psw->per, psw->dat, psw->io, psw->ext, + psw->key, psw->mcheck, psw->wait, psw->pstate, psw->as, psw->cc, psw->pm); pr_cont(" RI:%x EA:%x\n", psw->ri, psw->eaba); printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode, regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index ca960d0370d5..0c82f7903fc7 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -995,11 +995,11 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) regs.int_parm = CPU_MF_INT_SF_PRA; sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long; - psw_bits(regs.psw).ia = sfr->basic.ia; - psw_bits(regs.psw).t = sfr->basic.T; - psw_bits(regs.psw).w = sfr->basic.W; - psw_bits(regs.psw).p = sfr->basic.P; - psw_bits(regs.psw).as = sfr->basic.AS; + psw_bits(regs.psw).ia = sfr->basic.ia; + psw_bits(regs.psw).dat = sfr->basic.T; + psw_bits(regs.psw).wait = sfr->basic.W; + psw_bits(regs.psw).per = sfr->basic.P; + psw_bits(regs.psw).as = sfr->basic.AS; /* * Use the hardware provided configuration level to decide if the diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 0eec45b4575b..d94baa8db507 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; clear_pt_regs_flag(regs, PIF_PER_TRAP); - auprobe->saved_per = psw_bits(regs->psw).r; + auprobe->saved_per = psw_bits(regs->psw).per; auprobe->saved_int_code = regs->int_code; regs->int_code = UPROBE_TRAP_NR; regs->psw.addr = current->utask->xol_vaddr; @@ -81,7 +81,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) clear_tsk_thread_flag(current, TIF_UPROBE_SINGLESTEP); update_cr_regs(current); - psw_bits(regs->psw).r = auprobe->saved_per; + psw_bits(regs->psw).per = auprobe->saved_per; regs->int_code = auprobe->saved_int_code; if (fixup & FIXUP_PSW_NORMAL) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6ad4a9797de8..e0f7d5fc7efd 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -551,7 +551,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); - if (!psw.t) { + if (!psw.dat) { asce->val = 0; asce->r = 1; return 0; @@ -771,7 +771,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, if (!ctlreg0.lap) return 0; - if (psw_bits(*psw).t && asce.p) + if (psw_bits(*psw).dat && asce.p) return 0; return 1; } @@ -790,7 +790,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); ga &= PAGE_MASK; - if (psw_bits(*psw).t) { + if (psw_bits(*psw).dat) { rc = guest_translate(vcpu, ga, pages, asce, mode); if (rc < 0) return rc; @@ -831,7 +831,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, pages = vmalloc(nr_pages * sizeof(unsigned long)); if (!pages) return -ENOMEM; - need_ipte_lock = psw_bits(*psw).t && !asce.r; + need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) ipte_lock(vcpu); rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); @@ -899,7 +899,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, mode, PROT_TYPE_LA); } - if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */ + if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ rc = guest_translate(vcpu, gva, gpa, asce, mode); if (rc > 0) return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e9dd7efc57b8..e53292a89257 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -374,7 +374,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) static int handle_ipte_interlock(struct kvm_vcpu *vcpu) { vcpu->stat.instruction_ipte_interlock++; - if (psw_bits(vcpu->arch.sie_block->gpsw).p) + if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); kvm_s390_retry_instr(vcpu); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bc8c301f82b6..3348e60dd8ad 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -102,7 +102,7 @@ void __init paging_init(void) __ctl_load(S390_lowcore.kernel_asce, 7, 7); __ctl_load(S390_lowcore.kernel_asce, 13, 13); psw.mask = __extract_psw(); - psw_bits(psw).t = 1; + psw_bits(psw).dat = 1; psw_bits(psw).as = PSW_BITS_AS_HOME; __load_psw_mask(psw.mask); From 23fefe119ceb5fb0c7d3321010620010a4eddb18 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 7 Jun 2017 14:10:24 +0200 Subject: [PATCH 39/52] s390/kvm: avoid global config of vm.alloc_pgste=1 The system control vm.alloc_pgste is used to control the size of the page tables, either 2K or 4K. The idea is that a KVM host sets the vm.alloc_pgste control to 1 which causes *all* new processes to run with 4K page tables. For a non-kvm system the control should stay off to save on memory used for page tables. Trouble is that distributions choose to set the control globally to be able to run KVM guests. This wastes memory on non-KVM systems. Introduce the PT_S390_PGSTE ELF segment type to "mark" the qemu executable with it. All executables with this (empty) segment in its ELF phdr array will be started with 4K page tables. Any executable without PT_S390_PGSTE will run with the default 2K page tables. This removes the need to set vm.alloc_pgste=1 for a KVM host and minimizes the waste of memory for page tables. Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 32 +++++++++++++++++++++++++++++ arch/s390/include/asm/mmu_context.h | 4 +++- arch/s390/include/asm/ptrace.h | 2 ++ arch/s390/include/asm/thread_info.h | 1 + arch/s390/kernel/entry.S | 15 +++++++++++++- 6 files changed, 53 insertions(+), 2 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e7ff58150e8f..bb11f9f30c8d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES config S390 def_bool y + select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index e8f623041769..ec024c08dabe 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -117,6 +117,9 @@ #define ELF_DATA ELFDATA2MSB #define ELF_ARCH EM_S390 +/* s390 specific phdr types */ +#define PT_S390_PGSTE 0x70000000 + /* * ELF register definitions.. */ @@ -151,6 +154,35 @@ extern unsigned int vdso_enabled; && (x)->e_ident[EI_CLASS] == ELF_CLASS) #define compat_start_thread start_thread31 +struct arch_elf_state { + int rc; +}; + +#define INIT_ARCH_ELF_STATE { .rc = 0 } + +#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0) +#ifdef CONFIG_PGSTE +#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ +({ \ + struct arch_elf_state *_state = state; \ + if ((phdr)->p_type == PT_S390_PGSTE && \ + !page_table_allocate_pgste && \ + !test_thread_flag(TIF_PGSTE) && \ + !current->mm->context.alloc_pgste) { \ + set_thread_flag(TIF_PGSTE); \ + set_pt_regs_flag(task_pt_regs(current), \ + PIF_SYSCALL_RESTART); \ + _state->rc = -EAGAIN; \ + } \ + _state->rc; \ +}) +#else +#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ +({ \ + (state)->rc; \ +}) +#endif + /* For SVR4/S390 the function pointer to be registered with `atexit` is passed in R14. */ #define ELF_PLAT_INIT(_r, load_addr) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8712e11bead4..4541ac44b35f 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -25,7 +25,9 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #ifdef CONFIG_PGSTE - mm->context.alloc_pgste = page_table_allocate_pgste; + mm->context.alloc_pgste = page_table_allocate_pgste || + test_thread_flag(TIF_PGSTE) || + current->mm->context.alloc_pgste; mm->context.has_pgste = 0; mm->context.use_skey = 0; mm->context.use_cmma = 0; diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 004f54909235..853b01245c20 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -11,9 +11,11 @@ #define PIF_SYSCALL 0 /* inside a system call */ #define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */ +#define PIF_SYSCALL_RESTART 2 /* restart the current system call */ #define _PIF_SYSCALL _BITUL(PIF_SYSCALL) #define _PIF_PER_TRAP _BITUL(PIF_PER_TRAP) +#define _PIF_SYSCALL_RESTART _BITUL(PIF_SYSCALL_RESTART) #ifndef __ASSEMBLY__ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 0b3ee083a665..1aecf432c48d 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -58,6 +58,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define TIF_UPROBE 3 /* breakpointed or single-stepping */ #define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ +#define TIF_PGSTE 6 /* New mm's will use 4K page tables */ #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 6315037335ba..0c0138c7dfc7 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -52,7 +52,7 @@ _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) _CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ _CIF_ASCE_SECONDARY | _CIF_FPU) -_PIF_WORK = (_PIF_PER_TRAP) +_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) #define BASED(name) name-cleanup_critical(%r13) @@ -334,6 +334,8 @@ ENTRY(system_call) jo .Lsysc_mcck_pending TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED jo .Lsysc_reschedule + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART + jo .Lsysc_syscall_restart #ifdef CONFIG_UPROBES TSTMSK __TI_flags(%r12),_TIF_UPROBE jo .Lsysc_uprobe_notify @@ -347,6 +349,8 @@ ENTRY(system_call) jo .Lsysc_patch_pending # handle live patching just before # signals and possible syscall restart #endif + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART + jo .Lsysc_syscall_restart TSTMSK __TI_flags(%r12),_TIF_SIGPENDING jo .Lsysc_sigpending TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME @@ -447,6 +451,15 @@ ENTRY(system_call) larl %r14,.Lsysc_return jg do_per_trap +# +# _PIF_SYSCALL_RESTART is set, repeat the current system call +# +.Lsysc_syscall_restart: + ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART + lmg %r1,%r7,__PT_R1(%r11) # load svc arguments + lg %r2,__PT_ORIG_GPR2(%r11) + j .Lsysc_do_svc + # # call tracehook_report_syscall_entry/tracehook_report_syscall_exit before # and after the system call From f044f4c588985867d17499e51460053a96f9ff7a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 12 Jun 2017 08:52:56 +0200 Subject: [PATCH 40/52] s390/fpu: export save_fpu_regs for all configs The save_fpu_regs function is a general API that is supposed to be usable for modules as well. Remove the #ifdef that hides the symbol for CONFIG_KVM=n. Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 0c0138c7dfc7..e83aff630bcf 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -894,9 +894,7 @@ ENTRY(save_fpu_regs) oi __LC_CPU_FLAGS+7,_CIF_FPU br %r14 .Lsave_fpu_regs_end: -#if IS_ENABLED(CONFIG_KVM) EXPORT_SYMBOL(save_fpu_regs) -#endif /* * Load floating-point controls and floating-point or vector registers. From c929500d7a5aaea4f2eeba10816bc5341c66ae57 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Wed, 7 Jun 2017 11:30:42 +0200 Subject: [PATCH 41/52] s390/nmi: s390: New low level handling for machine check happening in guest Add the logic to check if the machine check happens when the guest is running. If yes, set the exit reason -EINTR in the machine check's interrupt handler. Refactor s390_do_machine_check to avoid panicing the host for some kinds of machine checks which happen when guest is running. Reinject the instruction processing damage's machine checks including Delayed Access Exception instead of damaging the host if it happens in the guest because it could be caused by improper update on TLB entry or other software case and impacts the guest only. Signed-off-by: QingFeng Hao Acked-by: Martin Schwidefsky Acked-by: Heiko Carstens Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/nmi.h | 7 +++++ arch/s390/include/asm/processor.h | 2 ++ arch/s390/kernel/asm-offsets.c | 3 ++ arch/s390/kernel/entry.S | 13 +++++++- arch/s390/kernel/nmi.c | 50 ++++++++++++++++++++++++------- 5 files changed, 64 insertions(+), 11 deletions(-) diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index e3e8895f5d3e..13623b9991d4 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -14,7 +14,14 @@ #include #include +#define MCIC_SUBCLASS_MASK (1ULL<<63 | 1ULL<<62 | 1ULL<<61 | \ + 1ULL<<59 | 1ULL<<58 | 1ULL<<56 | \ + 1ULL<<55 | 1ULL<<54 | 1ULL<<53 | \ + 1ULL<<52 | 1ULL<<47 | 1ULL<<46 | \ + 1ULL<<45 | 1ULL<<44) #define MCCK_CODE_SYSTEM_DAMAGE _BITUL(63) +#define MCCK_CODE_EXT_DAMAGE _BITUL(63 - 5) +#define MCCK_CODE_CP _BITUL(63 - 9) #define MCCK_CODE_CPU_TIMER_VALID _BITUL(63 - 46) #define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20) #define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 60d395fdc864..5b1b247dfbca 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -20,6 +20,7 @@ #define CIF_FPU 4 /* restore FPU registers */ #define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */ #define CIF_ENABLED_WAIT 6 /* in enabled wait state */ +#define CIF_MCCK_GUEST 7 /* machine check happening in guest */ #define _CIF_MCCK_PENDING _BITUL(CIF_MCCK_PENDING) #define _CIF_ASCE_PRIMARY _BITUL(CIF_ASCE_PRIMARY) @@ -28,6 +29,7 @@ #define _CIF_FPU _BITUL(CIF_FPU) #define _CIF_IGNORE_IRQ _BITUL(CIF_IGNORE_IRQ) #define _CIF_ENABLED_WAIT _BITUL(CIF_ENABLED_WAIT) +#define _CIF_MCCK_GUEST _BITUL(CIF_MCCK_GUEST) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 6bb29633e1f1..b65c414b6c0e 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -58,6 +58,9 @@ int main(void) OFFSET(__SF_BACKCHAIN, stack_frame, back_chain); OFFSET(__SF_GPRS, stack_frame, gprs); OFFSET(__SF_EMPTY, stack_frame, empty1); + OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]); + OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]); BLANK(); /* timeval/timezone offsets for use by vdso */ OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index a5f5d3bb3dbc..9b48196ebf40 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -225,6 +225,7 @@ ENTRY(sie64a) jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed +.Lsie_entry: sie 0(%r14) .Lsie_skip: ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE @@ -1102,7 +1103,13 @@ cleanup_critical: .quad .Lsie_done .Lcleanup_sie: - lg %r9,__SF_EMPTY(%r15) # get control block pointer + cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? + je 1f + slg %r9,BASED(.Lsie_crit_mcck_start) + clg %r9,BASED(.Lsie_crit_mcck_length) + jh 1f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +1: lg %r9,__SF_EMPTY(%r15) # get control block pointer ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit @@ -1274,6 +1281,10 @@ cleanup_critical: .quad .Lsie_gmap .Lsie_critical_length: .quad .Lsie_done - .Lsie_gmap +.Lsie_crit_mcck_start: + .quad .Lsie_entry +.Lsie_crit_mcck_length: + .quad .Lsie_skip - .Lsie_entry #endif .section .rodata, "a" diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 985589523970..958cc3352faa 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -25,6 +25,7 @@ #include #include #include +#include struct mcck_struct { unsigned int kill_task : 1; @@ -280,6 +281,8 @@ static int notrace s390_validate_registers(union mci mci, int umode) #define ED_STP_ISLAND 6 /* External damage STP island check */ #define ED_STP_SYNC 7 /* External damage STP sync check */ +#define MCCK_CODE_NO_GUEST (MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE) + /* * machine check handler. */ @@ -291,6 +294,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) struct mcck_struct *mcck; unsigned long long tmp; union mci mci; + unsigned long mcck_dam_code; nmi_enter(); inc_irq_stat(NMI_NMI); @@ -301,7 +305,13 @@ void notrace s390_do_machine_check(struct pt_regs *regs) /* System damage -> stopping machine */ s390_handle_damage(); } - if (mci.pd) { + + /* + * Reinject the instruction processing damages' machine checks + * including Delayed Access Exception into the guest + * instead of damaging the host if they happen in the guest. + */ + if (mci.pd && !test_cpu_flag(CIF_MCCK_GUEST)) { if (mci.b) { /* Processing backup -> verify if we can survive this */ u64 z_mcic, o_mcic, t_mcic; @@ -358,15 +368,22 @@ void notrace s390_do_machine_check(struct pt_regs *regs) if (mcck->stp_queue) set_cpu_flag(CIF_MCCK_PENDING); } - if (mci.se) - /* Storage error uncorrected */ - s390_handle_damage(); - if (mci.ke) - /* Storage key-error uncorrected */ - s390_handle_damage(); - if (mci.ds && mci.fa) - /* Storage degradation */ - s390_handle_damage(); + + /* + * Reinject storage related machine checks into the guest if they + * happen when the guest is running. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) { + if (mci.se) + /* Storage error uncorrected */ + s390_handle_damage(); + if (mci.ke) + /* Storage key-error uncorrected */ + s390_handle_damage(); + if (mci.ds && mci.fa) + /* Storage degradation */ + s390_handle_damage(); + } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; @@ -377,6 +394,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->warning = 1; set_cpu_flag(CIF_MCCK_PENDING); } + + /* + * If there are only Channel Report Pending and External Damage + * machine checks, they will not be reinjected into the guest + * because they refer to host conditions only. + */ + mcck_dam_code = (mci.val & MCIC_SUBCLASS_MASK); + if (test_cpu_flag(CIF_MCCK_GUEST) && + (mcck_dam_code & MCCK_CODE_NO_GUEST) != mcck_dam_code) { + /* Set exit reason code for host's later handling */ + *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; + } + clear_cpu_flag(CIF_MCCK_GUEST); nmi_exit(); } From da72ca4d4090a8ab0e6b0a23682ef42d39d7ae00 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Wed, 7 Jun 2017 11:41:19 +0200 Subject: [PATCH 42/52] KVM: s390: Backup the guest's machine check info When a machine check happens in the guest, related mcck info (mcic, external damage code, ...) is stored in the vcpu's lowcore on the host. Then the machine check handler's low-level part is executed, followed by the high-level part. If the high-level part's execution is interrupted by a new machine check happening on the same vcpu on the host, the mcck info in the lowcore is overwritten with the new machine check's data. If the high-level part's execution is scheduled to a different cpu, the mcck info in the lowcore is uncertain. Therefore, for both cases, the further reinjection to the guest will use the wrong data. Let's backup the mcck info in the lowcore to the sie page for further reinjection, so that the right data will be used. Add new member into struct sie_page to store related machine check's info of mcic, failing storage address and external damage code. Signed-off-by: QingFeng Hao Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 17 +++++++++++++++- arch/s390/kernel/nmi.c | 34 ++++++++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 1 + 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 426614a882a9..c6e1d5fa1ad1 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -107,6 +107,20 @@ struct esca_block { struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; } __packed; +/* + * This struct is used to store some machine check info from lowcore + * for machine checks that happen while the guest is running. + * This info in host's lowcore might be overwritten by a second machine + * check from host when host is in the machine check's high-level handling. + * The size is 24 bytes. + */ +struct mcck_volatile_info { + __u64 mcic; + __u64 failing_storage_address; + __u32 ext_damage_code; + __u32 reserved; +}; + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -264,7 +278,8 @@ struct kvm_s390_itdb { struct sie_page { struct kvm_s390_sie_block sie_block; - __u8 reserved200[1024]; /* 0x0200 */ + struct mcck_volatile_info mcck_info; /* 0x0200 */ + __u8 reserved218[1000]; /* 0x0218 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ } __packed; diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 958cc3352faa..31d03a84126c 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -26,6 +26,7 @@ #include #include #include +#include struct mcck_struct { unsigned int kill_task : 1; @@ -275,6 +276,31 @@ static int notrace s390_validate_registers(union mci mci, int umode) return kill_task; } +/* + * Backup the guest's machine check info to its description block + */ +static void notrace s390_backup_mcck_info(struct pt_regs *regs) +{ + struct mcck_volatile_info *mcck_backup; + struct sie_page *sie_page; + + /* r14 contains the sie block, which was set in sie64a */ + struct kvm_s390_sie_block *sie_block = + (struct kvm_s390_sie_block *) regs->gprs[14]; + + if (sie_block == NULL) + /* Something's seriously wrong, stop system. */ + s390_handle_damage(); + + sie_page = container_of(sie_block, struct sie_page, sie_block); + mcck_backup = &sie_page->mcck_info; + mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); + mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; + mcck_backup->failing_storage_address + = S390_lowcore.failing_storage_address; +} + #define MAX_IPD_COUNT 29 #define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ @@ -355,6 +381,14 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->mcck_code = mci.val; set_cpu_flag(CIF_MCCK_PENDING); } + + /* + * Backup the machine check's info if it happens when the guest + * is running. + */ + if (test_cpu_flag(CIF_MCCK_GUEST)) + s390_backup_mcck_info(regs); + if (mci.cd) { /* Timing facility damage */ s390_handle_damage(); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 689ac48361c6..0457e03199c5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2069,6 +2069,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) goto out; + BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) goto out_free_cpu; From 795818e8bf17dbc791764ba9fb723278ee934676 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sat, 10 Jun 2017 13:09:07 +0200 Subject: [PATCH 43/52] s390/pci: don't cleanup in arch_setup_msi_irqs After failures in arch_setup_msi_irqs common code calls arch_teardown_msi_irqs. Thus, remove cleanup code from arch_setup_msi_irqs. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci.h | 2 +- arch/s390/pci/pci.c | 45 ++++++++++++++----------------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 4e3186649578..328142c8fe92 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -109,7 +109,7 @@ struct zpci_dev { u64 msi_addr; /* MSI address */ unsigned int max_msi; /* maximum number of MSI's */ struct airq_iv *aibv; /* adapter interrupt bit vector */ - unsigned int aisb; /* number of the summary bit */ + unsigned long aisb; /* number of the summary bit */ /* DMA stuff */ unsigned long *dma_table; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 8051df109db3..3dd9686c576a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -372,22 +372,21 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) struct msi_msg msg; int rc, irq; + zdev->aisb = -1UL; if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); /* Allocate adapter summary indicator bit */ - rc = -EIO; aisb = airq_iv_alloc_bit(zpci_aisb_iv); if (aisb == -1UL) - goto out; + return -EIO; zdev->aisb = aisb; /* Create adapter interrupt vector */ - rc = -ENOMEM; zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); if (!zdev->aibv) - goto out_si; + return -ENOMEM; /* Wire up shortcut pointer */ zpci_aibv[aisb] = zdev->aibv; @@ -398,10 +397,10 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) rc = -EIO; irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ if (irq < 0) - goto out_msi; + return -ENOMEM; rc = irq_set_msi_desc(irq, msi); if (rc) - goto out_msi; + return rc; irq_set_chip_and_handler(irq, &zpci_irq_chip, handle_simple_irq); msg.data = hwirq; @@ -415,27 +414,9 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) /* Enable adapter interrupts */ rc = zpci_set_airq(zdev); if (rc) - goto out_msi; + return rc; return (msi_vecs == nvec) ? 0 : msi_vecs; - -out_msi: - for_each_pci_msi_entry(msi, pdev) { - if (hwirq-- == 0) - break; - irq_set_msi_desc(msi->irq, NULL); - irq_free_desc(msi->irq); - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; - } - zpci_aibv[aisb] = NULL; - airq_iv_release(zdev->aibv); -out_si: - airq_iv_free_bit(zpci_aisb_iv, aisb); -out: - return rc; } void arch_teardown_msi_irqs(struct pci_dev *pdev) @@ -451,6 +432,8 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) /* Release MSI interrupts */ for_each_pci_msi_entry(msi, pdev) { + if (!msi->irq) + continue; if (msi->msi_attrib.is_msix) __pci_msix_desc_mask_irq(msi, 1); else @@ -463,9 +446,15 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) msi->irq = 0; } - zpci_aibv[zdev->aisb] = NULL; - airq_iv_release(zdev->aibv); - airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); + if (zdev->aisb != -1UL) { + zpci_aibv[zdev->aisb] = NULL; + airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); + zdev->aisb = -1UL; + } + if (zdev->aibv) { + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + } } static void zpci_map_resources(struct pci_dev *pdev) From 4dfbd3efe3f0cf9ff1325b87491e1b1fe07afaf1 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sat, 10 Jun 2017 13:54:44 +0200 Subject: [PATCH 44/52] s390/pci: improve error handling during interrupt deregistration When we ask a function to stop creating interrupts this may fail due to the function being already gone (e.g. after hot-unplug). Consequently we don't free associated resources like summary bits and bit vectors used for irq processing. This could lead to situations where we ran out of these resources and fail to setup new interrupts. The fix is to just ignore the errors in cases where we can be sure no new interrupts are generated. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci_insn.h | 2 +- arch/s390/pci/pci.c | 29 +++++++++++++++++++---------- arch/s390/pci/pci_insn.c | 10 +++++----- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 649eb62c52b3..34abcf275799 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -76,7 +76,7 @@ struct zpci_fib { u32 gd; } __packed __aligned(8); -int zpci_mod_fc(u64 req, struct zpci_fib *fib); +u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); int zpci_refresh_trans(u64 fn, u64 addr, u64 range); int zpci_load(u64 *data, u64 req, u64 offset); int zpci_store(u64 data, u64 req, u64 offset); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 3dd9686c576a..82f3e788b0be 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -108,6 +108,7 @@ static int zpci_set_airq(struct zpci_dev *zdev) { u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); struct zpci_fib fib = {0}; + u8 status; fib.isc = PCI_ISC; fib.sum = 1; /* enable summary notifications */ @@ -117,7 +118,22 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.aisb = (unsigned long) zpci_aisb_iv->vector + (zdev->aisb/64)*8; fib.aisbo = zdev->aisb & 63; - return zpci_mod_fc(req, &fib); + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister adapter interruptions */ +static int zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {0}; + u8 cc, status; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; } struct mod_pci_args { @@ -131,13 +147,14 @@ static int mod_pci(struct zpci_dev *zdev, int fn, u8 dmaas, struct mod_pci_args { u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, fn); struct zpci_fib fib = {0}; + u8 status; fib.pba = args->base; fib.pal = args->limit; fib.iota = args->iota; fib.fmb_addr = args->fmb_addr; - return zpci_mod_fc(req, &fib); + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } /* Modify PCI: Register I/O address translation parameters */ @@ -159,14 +176,6 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) return mod_pci(zdev, ZPCI_MOD_FC_DEREG_IOAT, dmaas, &args); } -/* Modify PCI: Unregister adapter interruptions */ -static int zpci_clear_airq(struct zpci_dev *zdev) -{ - struct mod_pci_args args = { 0, 0, 0, 0 }; - - return mod_pci(zdev, ZPCI_MOD_FC_DEREG_INT, 0, &args); -} - /* Modify PCI: Set PCI function measurement parameters */ int zpci_fmb_enable_device(struct zpci_dev *zdev) { diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index fa8d7d4b9751..ea34086c8674 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -40,20 +40,20 @@ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) return cc; } -int zpci_mod_fc(u64 req, struct zpci_fib *fib) +u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) { - u8 cc, status; + u8 cc; do { - cc = __mpcifc(req, fib, &status); + cc = __mpcifc(req, fib, status); if (cc == 2) msleep(ZPCI_INSN_BUSY_DELAY); } while (cc == 2); if (cc) - zpci_err_insn(cc, status, req, 0); + zpci_err_insn(cc, *status, req, 0); - return (cc) ? -EIO : 0; + return cc; } /* Refresh PCI Translations */ From 725708349172f00b86e2bac5c03de360b79aaf65 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sat, 10 Jun 2017 14:10:00 +0200 Subject: [PATCH 45/52] s390/pci: improve unreg_ioat error handling DMA tables are freed in zpci_dma_exit_device regardless of the return code of zpci_unregister_ioat. This could lead to a use after free. On the other hand during function hot-unplug, zpci_unregister_ioat will always fail since the function is already gone. So let zpci_unregister_ioat report success when the function is gone but don't cleanup the dma table when a function could still have it in access. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci.c | 19 ++++++++++++++----- arch/s390/pci/pci_dma.c | 4 +++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 82f3e788b0be..4f541f54470f 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -161,19 +161,28 @@ static int mod_pci(struct zpci_dev *zdev, int fn, u8 dmaas, struct mod_pci_args int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, u64 base, u64 limit, u64 iota) { - struct mod_pci_args args = { base, limit, iota, 0 }; + u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); + struct zpci_fib fib = {0}; + u8 status; WARN_ON_ONCE(iota & 0x3fff); - args.iota |= ZPCI_IOTA_RTTO_FLAG; - return mod_pci(zdev, ZPCI_MOD_FC_REG_IOAT, dmaas, &args); + fib.pba = base; + fib.pal = limit; + fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } /* Modify PCI: Unregister I/O address translation parameters */ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) { - struct mod_pci_args args = { 0, 0, 0, 0 }; + u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_DEREG_IOAT); + struct zpci_fib fib = {0}; + u8 cc, status; - return mod_pci(zdev, ZPCI_MOD_FC_DEREG_IOAT, dmaas, &args); + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3) /* Function already gone. */ + cc = 0; + return cc ? -EIO : 0; } /* Modify PCI: Set PCI function measurement parameters */ diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 9081a57fa340..8eb1cc341dab 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -601,7 +601,9 @@ void zpci_dma_exit_device(struct zpci_dev *zdev) */ WARN_ON(zdev->s390_domain); - zpci_unregister_ioat(zdev, 0); + if (zpci_unregister_ioat(zdev, 0)) + return; + dma_cleanup_tables(zdev->dma_table); zdev->dma_table = NULL; vfree(zdev->iommu_bitmap); From 4e5bd7803bd1e822513d1007bad1527a2ec589b6 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sat, 10 Jun 2017 14:12:13 +0200 Subject: [PATCH 46/52] s390/pci: improve error handling during fmb (de)registration Cleanup in zpci_fmb_enable_device when fmb registration fails. Also don't free the fmb when deregistration fails in zpci_fmb_disable_device but handle error situations when a function was hot-unplugged. Also remove the mod_pci helper since it is no longer used. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci.c | 51 +++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 4f541f54470f..6a44a68efb81 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -136,27 +136,6 @@ static int zpci_clear_airq(struct zpci_dev *zdev) return cc ? -EIO : 0; } -struct mod_pci_args { - u64 base; - u64 limit; - u64 iota; - u64 fmb_addr; -}; - -static int mod_pci(struct zpci_dev *zdev, int fn, u8 dmaas, struct mod_pci_args *args) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, fn); - struct zpci_fib fib = {0}; - u8 status; - - fib.pba = args->base; - fib.pal = args->limit; - fib.iota = args->iota; - fib.fmb_addr = args->fmb_addr; - - return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; -} - /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, u64 base, u64 limit, u64 iota) @@ -188,7 +167,9 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) /* Modify PCI: Set PCI function measurement parameters */ int zpci_fmb_enable_device(struct zpci_dev *zdev) { - struct mod_pci_args args = { 0, 0, 0, 0 }; + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_fib fib = {0}; + u8 cc, status; if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length) return -EINVAL; @@ -203,25 +184,35 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) atomic64_set(&zdev->mapped_pages, 0); atomic64_set(&zdev->unmapped_pages, 0); - args.fmb_addr = virt_to_phys(zdev->fmb); - return mod_pci(zdev, ZPCI_MOD_FC_SET_MEASURE, 0, &args); + fib.fmb_addr = virt_to_phys(zdev->fmb); + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + kmem_cache_free(zdev_fmb_cache, zdev->fmb); + zdev->fmb = NULL; + } + return cc ? -EIO : 0; } /* Modify PCI: Disable PCI function measurement */ int zpci_fmb_disable_device(struct zpci_dev *zdev) { - struct mod_pci_args args = { 0, 0, 0, 0 }; - int rc; + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_fib fib = {0}; + u8 cc, status; if (!zdev->fmb) return -EINVAL; /* Function measurement is disabled if fmb address is zero */ - rc = mod_pci(zdev, ZPCI_MOD_FC_SET_MEASURE, 0, &args); + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3) /* Function already gone. */ + cc = 0; - kmem_cache_free(zdev_fmb_cache, zdev->fmb); - zdev->fmb = NULL; - return rc; + if (!cc) { + kmem_cache_free(zdev_fmb_cache, zdev->fmb); + zdev->fmb = NULL; + } + return cc ? -EIO : 0; } static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) From 783684f1f60faec09f3ac74c0b12e89bdb182429 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 26 Apr 2017 18:59:52 +0200 Subject: [PATCH 47/52] s390/pci: introduce clp_get_state Code handling pci hotplug needs to determine the configuration state of a pci function. Implement clp_get_state as a wrapper for list pci functions. Also change enum zpci_state to match the configuration state values. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci.h | 10 ++++---- arch/s390/pci/pci_clp.c | 51 ++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 328142c8fe92..01c58d41bee8 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -70,11 +70,10 @@ struct zpci_fmb { } __packed __aligned(128); enum zpci_state { - ZPCI_FN_STATE_RESERVED, - ZPCI_FN_STATE_STANDBY, - ZPCI_FN_STATE_CONFIGURED, - ZPCI_FN_STATE_ONLINE, - NR_ZPCI_FN_STATES, + ZPCI_FN_STATE_STANDBY = 0, + ZPCI_FN_STATE_CONFIGURED = 1, + ZPCI_FN_STATE_RESERVED = 2, + ZPCI_FN_STATE_ONLINE = 3, }; struct zpci_bar_struct { @@ -172,6 +171,7 @@ int clp_rescan_pci_devices_simple(void); int clp_add_pci_device(u32, u32, int); int clp_enable_fh(struct zpci_dev *, u8); int clp_disable_fh(struct zpci_dev *); +int clp_get_state(u32 fid, enum zpci_state *state); #ifdef CONFIG_PCI /* Error handling and recovery */ diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 1c3332ac1957..267cdd69e6da 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -295,8 +295,8 @@ int clp_disable_fh(struct zpci_dev *zdev) return rc; } -static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, - void (*cb)(struct clp_fh_list_entry *entry)) +static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, + void (*cb)(struct clp_fh_list_entry *, void *)) { u64 resume_token = 0; int entries, i, rc; @@ -327,13 +327,13 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, resume_token = rrb->response.resume_token; for (i = 0; i < entries; i++) - cb(&rrb->response.fh_list[i]); + cb(&rrb->response.fh_list[i], data); } while (resume_token); out: return rc; } -static void __clp_add(struct clp_fh_list_entry *entry) +static void __clp_add(struct clp_fh_list_entry *entry, void *data) { if (!entry->vendor_id) return; @@ -341,7 +341,7 @@ static void __clp_add(struct clp_fh_list_entry *entry) clp_add_pci_device(entry->fid, entry->fh, entry->config_state); } -static void __clp_rescan(struct clp_fh_list_entry *entry) +static void __clp_rescan(struct clp_fh_list_entry *entry, void *data) { struct zpci_dev *zdev; @@ -364,7 +364,7 @@ static void __clp_rescan(struct clp_fh_list_entry *entry) } } -static void __clp_update(struct clp_fh_list_entry *entry) +static void __clp_update(struct clp_fh_list_entry *entry, void *data) { struct zpci_dev *zdev; @@ -387,7 +387,7 @@ int clp_scan_pci_devices(void) if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, __clp_add); + rc = clp_list_pci(rrb, NULL, __clp_add); clp_free_block(rrb); return rc; @@ -402,7 +402,7 @@ int clp_rescan_pci_devices(void) if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, __clp_rescan); + rc = clp_list_pci(rrb, NULL, __clp_rescan); clp_free_block(rrb); return rc; @@ -417,7 +417,40 @@ int clp_rescan_pci_devices_simple(void) if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, __clp_update); + rc = clp_list_pci(rrb, NULL, __clp_update); + + clp_free_block(rrb); + return rc; +} + +struct clp_state_data { + u32 fid; + enum zpci_state state; +}; + +static void __clp_get_state(struct clp_fh_list_entry *entry, void *data) +{ + struct clp_state_data *sd = data; + + if (entry->fid != sd->fid) + return; + + sd->state = entry->config_state; +} + +int clp_get_state(u32 fid, enum zpci_state *state) +{ + struct clp_req_rsp_list_pci *rrb; + struct clp_state_data sd = {fid, ZPCI_FN_STATE_RESERVED}; + int rc; + + rrb = clp_alloc_block(GFP_KERNEL); + if (!rrb) + return -ENOMEM; + + rc = clp_list_pci(rrb, &sd, __clp_get_state); + if (!rc) + *state = sd.state; clp_free_block(rrb); return rc; From 623bd44d3f277b7bbe16e0e091bd361e75964b5d Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 9 May 2017 12:27:30 +0200 Subject: [PATCH 48/52] s390/pci: improve pci hotplug PCI hotplug events basically notify about the new state of a function. Unfortunately some hypervisors implement hotplug events in a way where it is not clear what the new state of the function should be. Use clp_get_state to find the current state of the function and handle accordingly. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 9 +++++++++ arch/s390/pci/pci_event.c | 14 +++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 01c58d41bee8..280458c82104 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -158,6 +158,7 @@ extern const struct attribute_group *zpci_attr_groups[]; ----------------------------------------------------------------------------- */ /* Base stuff */ int zpci_create_device(struct zpci_dev *); +void zpci_remove_device(struct zpci_dev *zdev); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); void zpci_stop_device(struct zpci_dev *); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 6a44a68efb81..f4928bc57773 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -855,6 +855,15 @@ void zpci_stop_device(struct zpci_dev *zdev) } EXPORT_SYMBOL_GPL(zpci_stop_device); +void zpci_remove_device(struct zpci_dev *zdev) +{ + if (!zdev->bus) + return; + + pci_stop_root_bus(zdev->bus); + pci_remove_root_bus(zdev->bus); +} + int zpci_report_error(struct pci_dev *pdev, struct zpci_report_error_header *report) { diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index c2b27ad8e94d..0bbc04af4418 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -74,6 +74,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; + enum zpci_state state; int ret; if (zdev) @@ -108,6 +109,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) clp_add_pci_device(ccdf->fid, ccdf->fh, 0); break; case 0x0303: /* Deconfiguration requested */ + if (!zdev) + break; if (pdev) pci_stop_and_remove_bus_device_locked(pdev); @@ -121,7 +124,9 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zdev->state = ZPCI_FN_STATE_STANDBY; break; - case 0x0304: /* Configured -> Standby */ + case 0x0304: /* Configured -> Standby|Reserved */ + if (!zdev) + break; if (pdev) { /* Give the driver a hint that the function is * already unusable. */ @@ -132,6 +137,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zdev->fh = ccdf->fh; zpci_disable_device(zdev); zdev->state = ZPCI_FN_STATE_STANDBY; + if (!clp_get_state(ccdf->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) { + zpci_remove_device(zdev); + } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ clp_rescan_pci_devices(); @@ -139,8 +148,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0308: /* Standby -> Reserved */ if (!zdev) break; - pci_stop_root_bus(zdev->bus); - pci_remove_root_bus(zdev->bus); + zpci_remove_device(zdev); break; default: break; From 01553d9a2ba7c658bf9e9e5c65466508c1bd6db5 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 20 Jun 2017 15:56:05 +0200 Subject: [PATCH 49/52] s390/pci: fix handling of PEC 306 In contrast to other hotplug events PEC 0x306 isn't about a single but multiple devices. Also there's no information on what happened to these devices. We correctly handled hotplug that way but failed to handle hot-unplug. This patch addresses that and implements hot-unplug of multiple devices via PEC 306. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci.h | 2 +- arch/s390/pci/pci.c | 29 +++++++++++++++++++---------- arch/s390/pci/pci_clp.c | 25 ++++--------------------- 3 files changed, 24 insertions(+), 32 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 280458c82104..f36b4b726057 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -161,9 +161,9 @@ int zpci_create_device(struct zpci_dev *); void zpci_remove_device(struct zpci_dev *zdev); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); -void zpci_stop_device(struct zpci_dev *); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); int zpci_unregister_ioat(struct zpci_dev *, u8); +void zpci_remove_reserved_devices(void); /* CLP */ int clp_scan_pci_devices(void); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index f4928bc57773..114b390d80f9 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -86,6 +86,25 @@ struct zpci_dev *get_zdev_by_fid(u32 fid) return zdev; } +void zpci_remove_reserved_devices(void) +{ + struct zpci_dev *tmp, *zdev; + enum zpci_state state; + LIST_HEAD(remove); + + spin_lock(&zpci_list_lock); + list_for_each_entry_safe(zdev, tmp, &zpci_list, entry) { + if (zdev->state == ZPCI_FN_STATE_STANDBY && + !clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) + list_move_tail(&zdev->entry, &remove); + } + spin_unlock(&zpci_list_lock); + + list_for_each_entry_safe(zdev, tmp, &remove, entry) + zpci_remove_device(zdev); +} + static struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus) { return (bus && bus->sysdata) ? (struct zpci_dev *) bus->sysdata : NULL; @@ -845,16 +864,6 @@ out: return rc; } -void zpci_stop_device(struct zpci_dev *zdev) -{ - zpci_dma_exit_device(zdev); - /* - * Note: SCLP disables fh via set-pci-fn so don't - * do that here. - */ -} -EXPORT_SYMBOL_GPL(zpci_stop_device); - void zpci_remove_device(struct zpci_dev *zdev) { if (!zdev->bus) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 267cdd69e6da..3a5cd84e5a3b 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -334,14 +334,6 @@ out: } static void __clp_add(struct clp_fh_list_entry *entry, void *data) -{ - if (!entry->vendor_id) - return; - - clp_add_pci_device(entry->fid, entry->fh, entry->config_state); -} - -static void __clp_rescan(struct clp_fh_list_entry *entry, void *data) { struct zpci_dev *zdev; @@ -349,19 +341,8 @@ static void __clp_rescan(struct clp_fh_list_entry *entry, void *data) return; zdev = get_zdev_by_fid(entry->fid); - if (!zdev) { + if (!zdev) clp_add_pci_device(entry->fid, entry->fh, entry->config_state); - return; - } - - if (!entry->config_state) { - /* - * The handle is already disabled, that means no iota/irq freeing via - * the firmware interfaces anymore. Need to free resources manually - * (DMA memory, debug, sysfs)... - */ - zpci_stop_device(zdev); - } } static void __clp_update(struct clp_fh_list_entry *entry, void *data) @@ -398,11 +379,13 @@ int clp_rescan_pci_devices(void) struct clp_req_rsp_list_pci *rrb; int rc; + zpci_remove_reserved_devices(); + rrb = clp_alloc_block(GFP_KERNEL); if (!rrb) return -ENOMEM; - rc = clp_list_pci(rrb, NULL, __clp_rescan); + rc = clp_list_pci(rrb, NULL, __clp_add); clp_free_block(rrb); return rc; From be2c36769f6340322c5367586adad6c3ce1ed51a Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 21 Jun 2017 12:30:40 +0200 Subject: [PATCH 50/52] s390/pci: provide more debug information Add some debug data to observe the lifetime of the architecture specific device information. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci.c | 1 + arch/s390/pci/pci_clp.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 114b390d80f9..d5c7444a0c05 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -772,6 +772,7 @@ void pcibios_remove_bus(struct pci_bus *bus) list_del(&zdev->entry); spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rem fid:%x\n", zdev->fid); kfree(zdev); } diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 3a5cd84e5a3b..bd534b4d40e3 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -193,12 +193,12 @@ out: int clp_add_pci_device(u32 fid, u32 fh, int configured) { struct zpci_dev *zdev; - int rc; + int rc = -ENOMEM; zpci_dbg(3, "add fid:%x, fh:%x, c:%d\n", fid, fh, configured); zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); if (!zdev) - return -ENOMEM; + goto error; zdev->fh = fh; zdev->fid = fid; @@ -219,6 +219,7 @@ int clp_add_pci_device(u32 fid, u32 fh, int configured) return 0; error: + zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); kfree(zdev); return rc; } From 312e8462ab6a1594a0bc4bc51c8e36c9f07b447b Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 21 Jun 2017 10:20:35 +0200 Subject: [PATCH 51/52] s390/pci: recognize name clashes with uids When uid checking is enabled firmware guarantees uniqueness of the uids and we use them for device enumeration. Tests have shown that uid checking can be toggled at runtime. This is unfortunate since it can lead to name clashes. Recognize these name clashes by allocating bits in zpci_domain even for firmware provided ids. Signed-off-by: Sebastian Ott Reviewed-by: Pierre Morel Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index d5c7444a0c05..7b30af5da222 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -736,6 +736,16 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) { if (zpci_unique_uid) { zdev->domain = (u16) zdev->uid; + if (zdev->domain >= ZPCI_NR_DEVICES) + return 0; + + spin_lock(&zpci_domain_lock); + if (test_bit(zdev->domain, zpci_domain)) { + spin_unlock(&zpci_domain_lock); + return -EEXIST; + } + set_bit(zdev->domain, zpci_domain); + spin_unlock(&zpci_domain_lock); return 0; } @@ -752,7 +762,7 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) static void zpci_free_domain(struct zpci_dev *zdev) { - if (zpci_unique_uid) + if (zdev->domain >= ZPCI_NR_DEVICES) return; spin_lock(&zpci_domain_lock); From 795c9a5106119f45d2501c4fb01051178904753f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 22 Jun 2017 17:17:15 +0200 Subject: [PATCH 52/52] s390/dasd: Fix faulty ENODEV for RO sysfs attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a device is offline it can still be set to read-only via the bus id through sysfs. Only the read-only feature flag for the ccw_device is then set. If the device is online the corresponding block device needs to be set to read-only as well (via set_disk_ro()). The check whether there is a device to do so, however, happens after the feature flag was set. This leads to an unnecessary "no such device" error in the offline case. This bug was introduced by commit 7571cb1c8e3cc ("s390/dasd: Make use of dasd_set_feature() more often"). Fix this by simply returning count if no device is available. Fixes: 7571cb1c8e3cc ("s390/dasd: Make use of dasd_set_feature() more often") Reviewed-by: Stefan Haberland Signed-off-by: Jan Höppner Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index e943d9c48926..7c7351276d2e 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -786,7 +786,7 @@ dasd_ro_store(struct device *dev, struct device_attribute *attr, device = dasd_device_from_cdev(cdev); if (IS_ERR(device)) - return PTR_ERR(device); + return count; spin_lock_irqsave(get_ccwdev_lock(cdev), flags); val = val || test_bit(DASD_FLAG_DEVICE_RO, &device->flags);