From cbc41b43e7c2ba6503b6b96dfe38e6b1316f46cf Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Sat, 14 Jan 2017 18:01:54 +0530 Subject: [PATCH 01/13] sparc32: mm: srmmu: add __ro_after_init to sparc32_cachetlb_ops structures The objects viking_ops, viking_sun4d_smp_ops and smp_cachetlb_ops of type sparc32_cachetlb_ops are not modified anywhere after getting modified in the init functions. Inside init their reference is also stored in a pointer of type const struct sparc32_cachetlb_ops *. So these structures are never modified after init, therefore add __ro_after to the declaration of these structures. Signed-off-by: Bhumika Goyal Reviewed-by: Kees Cook Signed-off-by: David S. Miller --- arch/sparc/mm/srmmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index c7f2a5295b3a..def82f6d626f 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -1444,7 +1444,7 @@ static void poke_viking(void) srmmu_set_mmureg(mreg); } -static struct sparc32_cachetlb_ops viking_ops = { +static struct sparc32_cachetlb_ops viking_ops __ro_after_init = { .cache_all = viking_flush_cache_all, .cache_mm = viking_flush_cache_mm, .cache_page = viking_flush_cache_page, @@ -1475,7 +1475,7 @@ static struct sparc32_cachetlb_ops viking_ops = { * flushes going at once will require SMP locking anyways so there's * no real value in trying any harder than this. */ -static struct sparc32_cachetlb_ops viking_sun4d_smp_ops = { +static struct sparc32_cachetlb_ops viking_sun4d_smp_ops __ro_after_init = { .cache_all = viking_flush_cache_all, .cache_mm = viking_flush_cache_mm, .cache_page = viking_flush_cache_page, @@ -1759,7 +1759,7 @@ static void smp_flush_sig_insns(struct mm_struct *mm, unsigned long insn_addr) local_ops->sig_insns(mm, insn_addr); } -static struct sparc32_cachetlb_ops smp_cachetlb_ops = { +static struct sparc32_cachetlb_ops smp_cachetlb_ops __ro_after_init = { .cache_all = smp_flush_cache_all, .cache_mm = smp_flush_cache_mm, .cache_page = smp_flush_cache_page, From fc5e8c283999168942baa5ef7f3805444f80be2b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 4 Feb 2017 16:32:22 +0000 Subject: [PATCH 02/13] sparc: topology_64.h: Fix condition for including cpudata.h We currently define macros referring to cpu_data if CONFIG_SMP is defined, but only include the declaration if CONFIG_NUMA is defined. Fixes: 541cc39433a8 ("sparc: fix a building error reported by kbuild") Signed-off-by: Ben Hutchings Acked-by: Gonglei Signed-off-by: David S. Miller --- arch/sparc/include/asm/topology_64.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 225543000122..ad5293f89680 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -4,7 +4,6 @@ #ifdef CONFIG_NUMA #include -#include static inline int cpu_to_node(int cpu) { @@ -42,6 +41,9 @@ int __node_distance(int, int); #endif /* !(CONFIG_NUMA) */ #ifdef CONFIG_SMP + +#include + #define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id) #define topology_core_id(cpu) (cpu_data(cpu).core_id) #define topology_core_cpumask(cpu) (&cpu_core_sib_map[cpu]) From 269d8523585c1b82b53aff3cf00d88ccbaf58c35 Mon Sep 17 00:00:00 2001 From: Eric Saint Etienne Date: Mon, 6 Feb 2017 14:32:41 +0000 Subject: [PATCH 03/13] sparc64: fix for user probes in high memory When returning from the user probe code into userspace process, PC & NPC are truncated to 32 bits. Due to shared libraries getting loaded very high in the virtual address space of the process, placing a user probe inside a shared library makes the kernel return into the process at the wrong address, causing it to seg'fault most of the time. This patch prevents truncating PC and NPC. Signed-off-by: Eric Saint Etienne Reviewed-by: David Aldridge Signed-off-by: David S. Miller --- arch/sparc/include/asm/uprobes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/uprobes.h b/arch/sparc/include/asm/uprobes.h index f87aae5a908e..36196c17aff8 100644 --- a/arch/sparc/include/asm/uprobes.h +++ b/arch/sparc/include/asm/uprobes.h @@ -42,8 +42,8 @@ struct arch_uprobe { }; struct arch_uprobe_task { - u32 saved_tpc; - u32 saved_tnpc; + u64 saved_tpc; + u64 saved_tnpc; }; struct task_struct; From f41e54616ca1a199f6c17228f26082ccdaaab3de Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Feb 2017 14:09:26 -0500 Subject: [PATCH 04/13] sunvdc: Add support for setting physical sector size Physical sector size is supported in v1.2 of the vDisk protocol and should be set if available. If protocol version 1.2 is used and the physical disk size is unavailable, then the disk is considered busy. Signed-off-by: Liam R. Howlett Signed-off-by: David S. Miller --- drivers/block/sunvdc.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index cab157331c4e..3f3a3ab3d50a 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -34,6 +34,7 @@ MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); #define VDC_TX_RING_SIZE 512 +#define VDC_DEFAULT_BLK_SIZE 512 #define WAITING_FOR_LINK_UP 0x01 #define WAITING_FOR_TX_SPACE 0x02 @@ -73,6 +74,7 @@ struct vdc_port { u32 vdisk_size; u8 vdisk_type; u8 vdisk_mtype; + u32 vdisk_phys_blksz; char disk_name[32]; }; @@ -88,6 +90,7 @@ static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) /* Ordered from largest major to lowest */ static struct vio_version vdc_versions[] = { + { .major = 1, .minor = 2 }, { .major = 1, .minor = 1 }, { .major = 1, .minor = 0 }, }; @@ -271,6 +274,11 @@ static int vdc_handle_attr(struct vio_driver_state *vio, void *arg) if (pkt->max_xfer_size < port->max_xfer_size) port->max_xfer_size = pkt->max_xfer_size; port->vdisk_block_size = pkt->vdisk_block_size; + + port->vdisk_phys_blksz = VDC_DEFAULT_BLK_SIZE; + if (vdc_version_supported(port, 1, 2)) + port->vdisk_phys_blksz = pkt->phys_block_size; + return 0; } else { printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name); @@ -754,6 +762,12 @@ static int probe_disk(struct vdc_port *port) if (err) return err; + /* Using version 1.2 means vdisk_phys_blksz should be set unless the + * disk is reserved by another system. + */ + if (vdc_version_supported(port, 1, 2) && !port->vdisk_phys_blksz) + return -ENODEV; + if (vdc_version_supported(port, 1, 1)) { /* vdisk_size should be set during the handshake, if it wasn't * then the underlying disk is reserved by another system @@ -829,6 +843,8 @@ static int probe_disk(struct vdc_port *port) } } + blk_queue_physical_block_size(q, port->vdisk_phys_blksz); + pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", g->disk_name, port->vdisk_size, (port->vdisk_size >> (20 - 9)), @@ -910,7 +926,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) if (err) goto err_out_free_port; - port->vdisk_block_size = 512; + port->vdisk_block_size = VDC_DEFAULT_BLK_SIZE; port->max_xfer_size = ((128 * 1024) / port->vdisk_block_size); port->ring_cookies = ((port->max_xfer_size * port->vdisk_block_size) / PAGE_SIZE) + 2; From cffb3e76818fee4763a2ce5f2b1eca2d7885e2cf Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Wed, 1 Feb 2017 11:34:37 -0800 Subject: [PATCH 05/13] sparc64: Set cpu state to offline when stopped CPU needs to be marked offline before stopping it. When not marked offline, the xcall receives HV_EWOULDBLOCK and so assumes that not all CPUs received the message, and retries. After 10000 retries, it finally fails with fatal mondo timeout. Signed-off-by: Vijay Kumar Signed-off-by: David S. Miller --- arch/sparc/kernel/smp_64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 0ce347f8e4cc..712bf1b7f630 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1443,6 +1443,7 @@ void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) static void stop_this_cpu(void *dummy) { + set_cpu_online(smp_processor_id(), false); prom_stopself(); } @@ -1454,6 +1455,8 @@ void smp_send_stop(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; + + set_cpu_online(cpu, false); #ifdef CONFIG_SUN_LDOMS if (ldom_domaining_enabled) { unsigned long hv_err; From 7dd4fcf5b70694dc961eb6b954673e4fc9730dbd Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Wed, 1 Feb 2017 11:34:38 -0800 Subject: [PATCH 06/13] sparc64: Migrate hvcons irq to panicked cpu On panic, all other CPUs are stopped except the one which had hit panic. To keep console alive, we need to migrate hvcons irq to panicked CPU. Signed-off-by: Vijay Kumar Signed-off-by: David S. Miller --- arch/sparc/include/asm/setup.h | 5 ++++- arch/sparc/kernel/smp_64.c | 6 +++++- drivers/tty/serial/sunhv.c | 6 ++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 29d64b1758ed..478bf6bb4598 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -59,8 +59,11 @@ extern atomic_t dcpage_flushes; extern atomic_t dcpage_flushes_xcall; extern int sysctl_tsb_ratio; -#endif +#ifdef CONFIG_SERIAL_SUNHV +void sunhv_migrate_hvcons_irq(int cpu); +#endif +#endif void sun_do_break(void); extern int stop_a_enabled; extern int scons_pwroff; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 712bf1b7f630..90a02cb64e20 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1452,8 +1452,12 @@ void smp_send_stop(void) int cpu; if (tlb_type == hypervisor) { + int this_cpu = smp_processor_id(); +#ifdef CONFIG_SERIAL_SUNHV + sunhv_migrate_hvcons_irq(this_cpu); +#endif for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) + if (cpu == this_cpu) continue; set_cpu_online(cpu, false); diff --git a/drivers/tty/serial/sunhv.c b/drivers/tty/serial/sunhv.c index 73abd89c0108..c5ebdc8bed6a 100644 --- a/drivers/tty/serial/sunhv.c +++ b/drivers/tty/serial/sunhv.c @@ -398,6 +398,12 @@ static struct uart_driver sunhv_reg = { static struct uart_port *sunhv_port; +void sunhv_migrate_hvcons_irq(int cpu) +{ + /* Migrate hvcons irq to param cpu */ + irq_force_affinity(sunhv_port->irq, cpumask_of(cpu)); +} + /* Copy 's' into the con_write_page, decoding "\n" into * "\r\n" along the way. We have to return two lengths * because the caller needs to know how much to advance From 7db60d05e5ccc0a473fa2275f90f2fca0002ab21 Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Wed, 1 Feb 2017 11:34:39 -0800 Subject: [PATCH 07/13] sparc64: Send break twice from console to return to boot prom Now we can also jump to boot prom from sunhv console by sending break twice on console for both running and panicked kernel cases. Signed-off-by: Vijay Kumar Signed-off-by: David S. Miller --- drivers/tty/serial/sunhv.c | 6 +++++- kernel/panic.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/sunhv.c b/drivers/tty/serial/sunhv.c index c5ebdc8bed6a..46e46894e918 100644 --- a/drivers/tty/serial/sunhv.c +++ b/drivers/tty/serial/sunhv.c @@ -116,7 +116,7 @@ static int receive_chars_getchar(struct uart_port *port) static int receive_chars_read(struct uart_port *port) { - int saw_console_brk = 0; + static int saw_console_brk; int limit = 10000; while (limit-- > 0) { @@ -128,6 +128,9 @@ static int receive_chars_read(struct uart_port *port) bytes_read = 0; if (stat == CON_BREAK) { + if (saw_console_brk) + sun_do_break(); + if (uart_handle_break(port)) continue; saw_console_brk = 1; @@ -151,6 +154,7 @@ static int receive_chars_read(struct uart_port *port) if (port->sysrq != 0 && *con_read_page) { for (i = 0; i < bytes_read; i++) uart_handle_sysrq_char(port, con_read_page[i]); + saw_console_brk = 0; } if (port->state == NULL) diff --git a/kernel/panic.c b/kernel/panic.c index b95959733ce0..3ec16e603e88 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -273,7 +273,8 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n" + "twice on console to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) From 4cfe140618b99e653134598de9f18b48743549ec Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Wed, 1 Feb 2017 11:34:40 -0800 Subject: [PATCH 08/13] Documentation/sparc: Steps for sending break on sunhv console Documented the steps for sending break on sunhv console. Signed-off-by: Vijay Kumar Signed-off-by: David S. Miller --- Documentation/sparc/console.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 Documentation/sparc/console.txt diff --git a/Documentation/sparc/console.txt b/Documentation/sparc/console.txt new file mode 100644 index 000000000000..5aa735a44e02 --- /dev/null +++ b/Documentation/sparc/console.txt @@ -0,0 +1,9 @@ +Steps for sending 'break' on sunhv console: +=========================================== + +On Baremetal: + 1. press Esc + 'B' + +On LDOM: + 1. press Ctrl + ']' + 2. telnet> send break From c7d9f77d33a779ad582d8b2284ba007931ebd894 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Wed, 1 Feb 2017 16:16:36 -0800 Subject: [PATCH 09/13] sparc64: Multi-page size support Add support for using multiple hugepage sizes simultaneously on mainline. Currently, support for 256M has been added which can be used along with 8M pages. Page tables are set like this (e.g. for 256M page): VA + (8M * x) -> PA + (8M * x) (sz bit = 256M) where x in [0, 31] and TSB is set similarly: VA + (4M * x) -> PA + (4M * x) (sz bit = 256M) where x in [0, 63] - Testing Tested on Sonoma (which supports 256M pages) by running stream benchmark instances in parallel: one instance uses 8M pages and another uses 256M pages, consuming 48G each. Boot params used: default_hugepagesz=256M hugepagesz=256M hugepages=300 hugepagesz=8M hugepages=10000 Signed-off-by: Nitin Gupta Signed-off-by: David S. Miller --- arch/sparc/include/asm/page_64.h | 3 +- arch/sparc/include/asm/pgtable_64.h | 23 ++-- arch/sparc/include/asm/tlbflush_64.h | 5 +- arch/sparc/kernel/tsb.S | 21 +--- arch/sparc/mm/hugetlbpage.c | 160 ++++++++++++++++++++++++--- arch/sparc/mm/init_64.c | 42 ++++++- arch/sparc/mm/tlb.c | 17 ++- arch/sparc/mm/tsb.c | 44 ++++++-- 8 files changed, 253 insertions(+), 62 deletions(-) diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index c1263fc390db..d76f38d4171b 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -17,7 +17,7 @@ #define HPAGE_SHIFT 23 #define REAL_HPAGE_SHIFT 22 - +#define HPAGE_256MB_SHIFT 28 #define REAL_HPAGE_SIZE (_AC(1,UL) << REAL_HPAGE_SHIFT) #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) @@ -26,6 +26,7 @@ #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #define REAL_HPAGE_PER_HPAGE (_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT)) +#define HUGE_MAX_HSTATE 2 #endif #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 314b66851348..7932a4a37817 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -375,7 +375,10 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot) #define pgprot_noncached pgprot_noncached #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline unsigned long __pte_huge_mask(void) +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable); +#define arch_make_huge_pte arch_make_huge_pte +static inline unsigned long __pte_default_huge_mask(void) { unsigned long mask; @@ -395,12 +398,14 @@ static inline unsigned long __pte_huge_mask(void) static inline pte_t pte_mkhuge(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_PMD_HUGE | __pte_huge_mask()); + return __pte(pte_val(pte) | __pte_default_huge_mask()); } -static inline bool is_hugetlb_pte(pte_t pte) +static inline bool is_default_hugetlb_pte(pte_t pte) { - return !!(pte_val(pte) & __pte_huge_mask()); + unsigned long mask = __pte_default_huge_mask(); + + return (pte_val(pte) & mask) == mask; } static inline bool is_hugetlb_pmd(pmd_t pmd) @@ -875,10 +880,12 @@ static inline unsigned long pud_pfn(pud_t pud) /* Actual page table PTE updates. */ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, - pte_t *ptep, pte_t orig, int fullmm); + pte_t *ptep, pte_t orig, int fullmm, + unsigned int hugepage_shift); static void maybe_tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, - pte_t *ptep, pte_t orig, int fullmm) + pte_t *ptep, pte_t orig, int fullmm, + unsigned int hugepage_shift) { /* It is more efficient to let flush_tlb_kernel_range() * handle init_mm tlb flushes. @@ -887,7 +894,7 @@ static void maybe_tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, * and SUN4V pte layout, so this inline test is fine. */ if (likely(mm != &init_mm) && pte_accessible(mm, orig)) - tlb_batch_add(mm, vaddr, ptep, orig, fullmm); + tlb_batch_add(mm, vaddr, ptep, orig, fullmm, hugepage_shift); } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR @@ -906,7 +913,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t orig = *ptep; *ptep = pte; - maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm); + maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT); } #define set_pte_at(mm,addr,ptep,pte) \ diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index a8e192e90700..54be88a6774c 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -8,7 +8,7 @@ #define TLB_BATCH_NR 192 struct tlb_batch { - bool huge; + unsigned int hugepage_shift; struct mm_struct *mm; unsigned long tlb_nr; unsigned long active; @@ -17,7 +17,8 @@ struct tlb_batch { void flush_tsb_kernel_range(unsigned long start, unsigned long end); void flush_tsb_user(struct tlb_batch *tb); -void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, bool huge); +void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, + unsigned int hugepage_shift); /* TLB flush operations. */ diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index d568c8207af7..10689cfd0ad4 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -117,26 +117,11 @@ tsb_miss_page_table_walk_sun4v_fastpath: /* Valid PTE is now in %g5. */ #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) -661: sethi %uhi(_PAGE_SZALL_4U), %g7 + sethi %uhi(_PAGE_PMD_HUGE), %g7 sllx %g7, 32, %g7 - .section .sun4v_2insn_patch, "ax" - .word 661b - mov _PAGE_SZALL_4V, %g7 - nop - .previous - and %g5, %g7, %g2 - -661: sethi %uhi(_PAGE_SZHUGE_4U), %g7 - sllx %g7, 32, %g7 - .section .sun4v_2insn_patch, "ax" - .word 661b - mov _PAGE_SZHUGE_4V, %g7 - nop - .previous - - cmp %g2, %g7 - bne,pt %xcc, 60f + andcc %g5, %g7, %g0 + be,pt %xcc, 60f nop /* It is a huge page, use huge page TSB entry address we diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 988acc8b1b80..618a568cc7f2 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -28,6 +28,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(filp); unsigned long task_size = TASK_SIZE; struct vm_unmapped_area_info info; @@ -38,7 +39,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, info.length = len; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = PAGE_MASK & ~HPAGE_MASK; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -58,6 +59,7 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long pgoff, const unsigned long flags) { + struct hstate *h = hstate_file(filp); struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info; @@ -69,7 +71,7 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = PAGE_MASK & ~HPAGE_MASK; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -94,6 +96,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long task_size = TASK_SIZE; @@ -101,7 +104,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (test_thread_flag(TIF_32BIT)) task_size = STACK_TOP32; - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > task_size) return -ENOMEM; @@ -113,7 +116,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (task_size - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -127,6 +130,112 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } +static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift) +{ + return entry; +} + +static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift) +{ + unsigned long hugepage_size = _PAGE_SZ4MB_4V; + + pte_val(entry) = pte_val(entry) & ~_PAGE_SZALL_4V; + + switch (shift) { + case HPAGE_256MB_SHIFT: + hugepage_size = _PAGE_SZ256MB_4V; + pte_val(entry) |= _PAGE_PMD_HUGE; + break; + case HPAGE_SHIFT: + pte_val(entry) |= _PAGE_PMD_HUGE; + break; + default: + WARN_ONCE(1, "unsupported hugepage shift=%u\n", shift); + } + + pte_val(entry) = pte_val(entry) | hugepage_size; + return entry; +} + +static pte_t hugepage_shift_to_tte(pte_t entry, unsigned int shift) +{ + if (tlb_type == hypervisor) + return sun4v_hugepage_shift_to_tte(entry, shift); + else + return sun4u_hugepage_shift_to_tte(entry, shift); +} + +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writeable) +{ + unsigned int shift = huge_page_shift(hstate_vma(vma)); + + return hugepage_shift_to_tte(entry, shift); +} + +static unsigned int sun4v_huge_tte_to_shift(pte_t entry) +{ + unsigned long tte_szbits = pte_val(entry) & _PAGE_SZALL_4V; + unsigned int shift; + + switch (tte_szbits) { + case _PAGE_SZ256MB_4V: + shift = HPAGE_256MB_SHIFT; + break; + case _PAGE_SZ4MB_4V: + shift = REAL_HPAGE_SHIFT; + break; + default: + shift = PAGE_SHIFT; + break; + } + return shift; +} + +static unsigned int sun4u_huge_tte_to_shift(pte_t entry) +{ + unsigned long tte_szbits = pte_val(entry) & _PAGE_SZALL_4U; + unsigned int shift; + + switch (tte_szbits) { + case _PAGE_SZ256MB_4U: + shift = HPAGE_256MB_SHIFT; + break; + case _PAGE_SZ4MB_4U: + shift = REAL_HPAGE_SHIFT; + break; + default: + shift = PAGE_SHIFT; + break; + } + return shift; +} + +static unsigned int huge_tte_to_shift(pte_t entry) +{ + unsigned long shift; + + if (tlb_type == hypervisor) + shift = sun4v_huge_tte_to_shift(entry); + else + shift = sun4u_huge_tte_to_shift(entry); + + if (shift == PAGE_SHIFT) + WARN_ONCE(1, "tto_to_shift: invalid hugepage tte=0x%lx\n", + pte_val(entry)); + + return shift; +} + +static unsigned long huge_tte_to_size(pte_t pte) +{ + unsigned long size = 1UL << huge_tte_to_shift(pte); + + if (size == REAL_HPAGE_SIZE) + size = HPAGE_SIZE; + return size; +} + pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -160,35 +269,54 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { + unsigned int i, nptes, hugepage_shift; + unsigned long size; pte_t orig; + size = huge_tte_to_size(entry); + nptes = size >> PMD_SHIFT; + if (!pte_present(*ptep) && pte_present(entry)) - mm->context.hugetlb_pte_count++; + mm->context.hugetlb_pte_count += nptes; - addr &= HPAGE_MASK; + addr &= ~(size - 1); orig = *ptep; - *ptep = entry; + hugepage_shift = pte_none(orig) ? PAGE_SIZE : huge_tte_to_shift(orig); - /* Issue TLB flush at REAL_HPAGE_SIZE boundaries */ - maybe_tlb_batch_add(mm, addr, ptep, orig, 0); - maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, orig, 0); + for (i = 0; i < nptes; i++) + ptep[i] = __pte(pte_val(entry) + (i << PMD_SHIFT)); + + maybe_tlb_batch_add(mm, addr, ptep, orig, 0, hugepage_shift); + /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */ + if (size == HPAGE_SIZE) + maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, orig, 0, + hugepage_shift); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + unsigned int i, nptes, hugepage_shift; + unsigned long size; pte_t entry; entry = *ptep; + size = huge_tte_to_size(entry); + nptes = size >> PMD_SHIFT; + hugepage_shift = pte_none(entry) ? PAGE_SIZE : huge_tte_to_shift(entry); + if (pte_present(entry)) - mm->context.hugetlb_pte_count--; + mm->context.hugetlb_pte_count -= nptes; - addr &= HPAGE_MASK; - *ptep = __pte(0UL); + addr &= ~(size - 1); + for (i = 0; i < nptes; i++) + ptep[i] = __pte(0UL); - /* Issue TLB flush at REAL_HPAGE_SIZE boundaries */ - maybe_tlb_batch_add(mm, addr, ptep, entry, 0); - maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0); + maybe_tlb_batch_add(mm, addr, ptep, entry, 0, hugepage_shift); + /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */ + if (size == HPAGE_SIZE) + maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0, + hugepage_shift); return entry; } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5d2f91511c60..7ed3975d04cc 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -324,6 +324,46 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_inde tsb_insert(tsb, tag, tte); } +#ifdef CONFIG_HUGETLB_PAGE +static int __init setup_hugepagesz(char *string) +{ + unsigned long long hugepage_size; + unsigned int hugepage_shift; + unsigned short hv_pgsz_idx; + unsigned int hv_pgsz_mask; + int rc = 0; + + hugepage_size = memparse(string, &string); + hugepage_shift = ilog2(hugepage_size); + + switch (hugepage_shift) { + case HPAGE_256MB_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_256MB; + hv_pgsz_idx = HV_PGSZ_IDX_256MB; + break; + case HPAGE_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_4MB; + hv_pgsz_idx = HV_PGSZ_IDX_4MB; + break; + default: + hv_pgsz_mask = 0; + } + + if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) { + pr_warn("hugepagesz=%llu not supported by MMU.\n", + hugepage_size); + goto out; + } + + hugetlb_add_hstate(hugepage_shift - PAGE_SHIFT); + rc = 1; + +out: + return rc; +} +__setup("hugepagesz=", setup_hugepagesz); +#endif /* CONFIG_HUGETLB_PAGE */ + void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { struct mm_struct *mm; @@ -347,7 +387,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) if ((mm->context.hugetlb_pte_count || mm->context.thp_pte_count) && - is_hugetlb_pte(pte)) { + is_hugetlb_pmd(__pmd(pte_val(pte)))) { /* We are fabricating 8MB pages using 4MB real hw pages. */ pte_val(pte) |= (address & (1UL << REAL_HPAGE_SHIFT)); __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT, diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index c56a195c9071..afda3bbf7854 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -67,7 +67,7 @@ void arch_leave_lazy_mmu_mode(void) } static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, - bool exec, bool huge) + bool exec, unsigned int hugepage_shift) { struct tlb_batch *tb = &get_cpu_var(tlb_batch); unsigned long nr; @@ -84,19 +84,19 @@ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, } if (!tb->active) { - flush_tsb_user_page(mm, vaddr, huge); + flush_tsb_user_page(mm, vaddr, hugepage_shift); global_flush_tlb_page(mm, vaddr); goto out; } if (nr == 0) { tb->mm = mm; - tb->huge = huge; + tb->hugepage_shift = hugepage_shift; } - if (tb->huge != huge) { + if (tb->hugepage_shift != hugepage_shift) { flush_tlb_pending(); - tb->huge = huge; + tb->hugepage_shift = hugepage_shift; nr = 0; } @@ -110,10 +110,9 @@ out: } void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, - pte_t *ptep, pte_t orig, int fullmm) + pte_t *ptep, pte_t orig, int fullmm, + unsigned int hugepage_shift) { - bool huge = is_hugetlb_pte(orig); - if (tlb_type != hypervisor && pte_dirty(orig)) { unsigned long paddr, pfn = pte_pfn(orig); @@ -139,7 +138,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, no_cache_flush: if (!fullmm) - tlb_batch_add_one(mm, vaddr, pte_exec(orig), huge); + tlb_batch_add_one(mm, vaddr, pte_exec(orig), hugepage_shift); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index e20fbbafb0b0..4ccca32bd1e1 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -86,6 +86,33 @@ static void __flush_tsb_one(struct tlb_batch *tb, unsigned long hash_shift, __flush_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift, nentries); } +#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) +static void __flush_huge_tsb_one_entry(unsigned long tsb, unsigned long v, + unsigned long hash_shift, + unsigned long nentries, + unsigned int hugepage_shift) +{ + unsigned int hpage_entries; + unsigned int i; + + hpage_entries = 1 << (hugepage_shift - hash_shift); + for (i = 0; i < hpage_entries; i++) + __flush_tsb_one_entry(tsb, v + (i << hash_shift), hash_shift, + nentries); +} + +static void __flush_huge_tsb_one(struct tlb_batch *tb, unsigned long hash_shift, + unsigned long tsb, unsigned long nentries, + unsigned int hugepage_shift) +{ + unsigned long i; + + for (i = 0; i < tb->tlb_nr; i++) + __flush_huge_tsb_one_entry(tsb, tb->vaddrs[i], hash_shift, + nentries, hugepage_shift); +} +#endif + void flush_tsb_user(struct tlb_batch *tb) { struct mm_struct *mm = tb->mm; @@ -93,7 +120,7 @@ void flush_tsb_user(struct tlb_batch *tb) spin_lock_irqsave(&mm->context.lock, flags); - if (!tb->huge) { + if (tb->hugepage_shift == PAGE_SHIFT) { base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb; nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) @@ -101,24 +128,26 @@ void flush_tsb_user(struct tlb_batch *tb) __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); } #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) - if (tb->huge && mm->context.tsb_block[MM_TSB_HUGE].tsb) { + else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_tsb_one(tb, REAL_HPAGE_SHIFT, base, nentries); + __flush_huge_tsb_one(tb, REAL_HPAGE_SHIFT, base, nentries, + tb->hugepage_shift); } #endif spin_unlock_irqrestore(&mm->context.lock, flags); } -void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, bool huge) +void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, + unsigned int hugepage_shift) { unsigned long nentries, base, flags; spin_lock_irqsave(&mm->context.lock, flags); - if (!huge) { + if (hugepage_shift == PAGE_SHIFT) { base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb; nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) @@ -126,12 +155,13 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, bool huge) __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries); } #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) - if (huge && mm->context.tsb_block[MM_TSB_HUGE].tsb) { + else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_tsb_one_entry(base, vaddr, REAL_HPAGE_SHIFT, nentries); + __flush_huge_tsb_one_entry(base, vaddr, REAL_HPAGE_SHIFT, + nentries, hugepage_shift); } #endif spin_unlock_irqrestore(&mm->context.lock, flags); From dcd1912d21a02534d1f0a9005d5ba3283f164780 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Mon, 6 Feb 2017 12:33:26 -0800 Subject: [PATCH 10/13] sparc64: Add 64K page size support This patch depends on: [v6] sparc64: Multi-page size support - Testing Tested on Sonoma by running stream benchmark instance which allocated 48G worth of 64K pages. boot params: default_hugepagesz=64K hugepagesz=64K hugepages=1310720 Signed-off-by: Nitin Gupta Signed-off-by: David S. Miller --- arch/sparc/include/asm/page_64.h | 3 +- arch/sparc/mm/hugetlbpage.c | 54 +++++++++++++++++++++++++------- arch/sparc/mm/init_64.c | 4 +++ arch/sparc/mm/tsb.c | 5 +-- 4 files changed, 52 insertions(+), 14 deletions(-) diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h index d76f38d4171b..f294dd42fc7d 100644 --- a/arch/sparc/include/asm/page_64.h +++ b/arch/sparc/include/asm/page_64.h @@ -18,6 +18,7 @@ #define HPAGE_SHIFT 23 #define REAL_HPAGE_SHIFT 22 #define HPAGE_256MB_SHIFT 28 +#define HPAGE_64K_SHIFT 16 #define REAL_HPAGE_SIZE (_AC(1,UL) << REAL_HPAGE_SHIFT) #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) @@ -26,7 +27,7 @@ #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #define REAL_HPAGE_PER_HPAGE (_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT)) -#define HUGE_MAX_HSTATE 2 +#define HUGE_MAX_HSTATE 3 #endif #ifndef __ASSEMBLY__ diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 618a568cc7f2..605bfceb7d54 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -149,6 +149,9 @@ static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift) case HPAGE_SHIFT: pte_val(entry) |= _PAGE_PMD_HUGE; break; + case HPAGE_64K_SHIFT: + hugepage_size = _PAGE_SZ64K_4V; + break; default: WARN_ONCE(1, "unsupported hugepage shift=%u\n", shift); } @@ -185,6 +188,9 @@ static unsigned int sun4v_huge_tte_to_shift(pte_t entry) case _PAGE_SZ4MB_4V: shift = REAL_HPAGE_SHIFT; break; + case _PAGE_SZ64K_4V: + shift = HPAGE_64K_SHIFT; + break; default: shift = PAGE_SHIFT; break; @@ -204,6 +210,9 @@ static unsigned int sun4u_huge_tte_to_shift(pte_t entry) case _PAGE_SZ4MB_4U: shift = REAL_HPAGE_SHIFT; break; + case _PAGE_SZ64K_4U: + shift = HPAGE_64K_SHIFT; + break; default: shift = PAGE_SHIFT; break; @@ -241,12 +250,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, { pgd_t *pgd; pud_t *pud; + pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); - if (pud) - pte = (pte_t *)pmd_alloc(mm, pud, addr); + if (pud) { + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + if (sz == PMD_SHIFT) + pte = (pte_t *)pmd; + else + pte = pte_alloc_map(mm, pmd, addr); + } return pte; } @@ -255,42 +273,52 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; + pmd_t *pmd; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); if (!pgd_none(*pgd)) { pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) - pte = (pte_t *)pmd_offset(pud, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + if (is_hugetlb_pmd(*pmd)) + pte = (pte_t *)pmd; + else + pte = pte_offset_map(pmd, addr); + } + } } + return pte; } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { - unsigned int i, nptes, hugepage_shift; + unsigned int i, nptes, orig_shift, shift; unsigned long size; pte_t orig; size = huge_tte_to_size(entry); - nptes = size >> PMD_SHIFT; + shift = size >= HPAGE_SIZE ? PMD_SHIFT : PAGE_SHIFT; + nptes = size >> shift; if (!pte_present(*ptep) && pte_present(entry)) mm->context.hugetlb_pte_count += nptes; addr &= ~(size - 1); orig = *ptep; - hugepage_shift = pte_none(orig) ? PAGE_SIZE : huge_tte_to_shift(orig); + orig_shift = pte_none(orig) ? PAGE_SIZE : huge_tte_to_shift(orig); for (i = 0; i < nptes; i++) - ptep[i] = __pte(pte_val(entry) + (i << PMD_SHIFT)); + ptep[i] = __pte(pte_val(entry) + (i << shift)); - maybe_tlb_batch_add(mm, addr, ptep, orig, 0, hugepage_shift); + maybe_tlb_batch_add(mm, addr, ptep, orig, 0, orig_shift); /* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */ if (size == HPAGE_SIZE) maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, orig, 0, - hugepage_shift); + orig_shift); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, @@ -302,7 +330,11 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, entry = *ptep; size = huge_tte_to_size(entry); - nptes = size >> PMD_SHIFT; + if (size >= HPAGE_SIZE) + nptes = size >> PMD_SHIFT; + else + nptes = size >> PAGE_SHIFT; + hugepage_shift = pte_none(entry) ? PAGE_SIZE : huge_tte_to_shift(entry); if (pte_present(entry)) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 7ed3975d04cc..16c1e46e8603 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -345,6 +345,10 @@ static int __init setup_hugepagesz(char *string) hv_pgsz_mask = HV_PGSZ_MASK_4MB; hv_pgsz_idx = HV_PGSZ_IDX_4MB; break; + case HPAGE_64K_SHIFT: + hv_pgsz_mask = HV_PGSZ_MASK_64K; + hv_pgsz_idx = HV_PGSZ_IDX_64K; + break; default: hv_pgsz_mask = 0; } diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 4ccca32bd1e1..e39fc57ad850 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -147,12 +147,13 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, spin_lock_irqsave(&mm->context.lock, flags); - if (hugepage_shift == PAGE_SHIFT) { + if (hugepage_shift < HPAGE_SHIFT) { base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb; nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries); + __flush_huge_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries, + hugepage_shift); } #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { From 1537b26dab1c1dfd92a116a933143f52b1112a22 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 16 Feb 2017 15:05:58 -0500 Subject: [PATCH 11/13] sparc64: use latency groups to improve add_node_ranges speed add_node_ranges() takes 2.6s - 3.6s per 1T of boot time. On machine with 6T memory it takes 15.4s, on 32T it would take 82s-115s of boot time. This function sets NUMA ids for memory blocks, and scans the whole memory a page at a time to do so. But, we could use values in latency groups mask and match to determine the boundaries without checking every single page. With the fix the add_node_ranges() time is reduced from 15.4s down to 0.2s on machine with 6T memory. Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Reviewed-by: Bob Picco Signed-off-by: David S. Miller --- arch/sparc/mm/init_64.c | 212 ++++++++++++++++++++++------------------ 1 file changed, 115 insertions(+), 97 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 16c1e46e8603..77446eaf1395 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -829,13 +829,23 @@ static void __init find_ramdisk(unsigned long phys_base) struct node_mem_mask { unsigned long mask; - unsigned long val; + unsigned long match; }; static struct node_mem_mask node_masks[MAX_NUMNODES]; static int num_node_masks; #ifdef CONFIG_NEED_MULTIPLE_NODES +struct mdesc_mlgroup { + u64 node; + u64 latency; + u64 match; + u64 mask; +}; + +static struct mdesc_mlgroup *mlgroups; +static int num_mlgroups; + int numa_cpu_lookup_table[NR_CPUS]; cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; @@ -846,78 +856,129 @@ struct mdesc_mblock { }; static struct mdesc_mblock *mblocks; static int num_mblocks; -static int find_numa_node_for_addr(unsigned long pa, - struct node_mem_mask *pnode_mask); -static unsigned long __init ra_to_pa(unsigned long addr) +static struct mdesc_mblock * __init addr_to_mblock(unsigned long addr) { + struct mdesc_mblock *m = NULL; int i; for (i = 0; i < num_mblocks; i++) { - struct mdesc_mblock *m = &mblocks[i]; + m = &mblocks[i]; if (addr >= m->base && addr < (m->base + m->size)) { - addr += m->offset; break; } } - return addr; + + return m; } -static int __init find_node(unsigned long addr) +static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) { - static bool search_mdesc = true; - static struct node_mem_mask last_mem_mask = { ~0UL, ~0UL }; - static int last_index; - int i; + int prev_nid, new_nid; - addr = ra_to_pa(addr); - for (i = 0; i < num_node_masks; i++) { - struct node_mem_mask *p = &node_masks[i]; + prev_nid = -1; + for ( ; start < end; start += PAGE_SIZE) { + for (new_nid = 0; new_nid < num_node_masks; new_nid++) { + struct node_mem_mask *p = &node_masks[new_nid]; - if ((addr & p->mask) == p->val) - return i; - } - /* The following condition has been observed on LDOM guests because - * node_masks only contains the best latency mask and value. - * LDOM guest's mdesc can contain a single latency group to - * cover multiple address range. Print warning message only if the - * address cannot be found in node_masks nor mdesc. - */ - if ((search_mdesc) && - ((addr & last_mem_mask.mask) != last_mem_mask.val)) { - /* find the available node in the mdesc */ - last_index = find_numa_node_for_addr(addr, &last_mem_mask); - numadbg("find_node: latency group for address 0x%lx is %d\n", - addr, last_index); - if ((last_index < 0) || (last_index >= num_node_masks)) { - /* WARN_ONCE() and use default group 0 */ - WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node rule. Some physical memory will be owned by node 0."); - search_mdesc = false; - last_index = 0; + if ((start & p->mask) == p->match) { + if (prev_nid == -1) + prev_nid = new_nid; + break; + } } - } - return last_index; + if (new_nid == num_node_masks) { + prev_nid = 0; + WARN_ONCE(1, "addr[%Lx] doesn't match a NUMA node rule. Some memory will be owned by node 0.", + start); + break; + } + + if (prev_nid != new_nid) + break; + } + *nid = prev_nid; + + return start > end ? end : start; } static u64 __init memblock_nid_range(u64 start, u64 end, int *nid) { - *nid = find_node(start); - start += PAGE_SIZE; - while (start < end) { - int n = find_node(start); + u64 ret_end, pa_start, m_mask, m_match, m_end; + struct mdesc_mblock *mblock; + int _nid, i; - if (n != *nid) - break; - start += PAGE_SIZE; + if (tlb_type != hypervisor) + return memblock_nid_range_sun4u(start, end, nid); + + mblock = addr_to_mblock(start); + if (!mblock) { + WARN_ONCE(1, "memblock_nid_range: Can't find mblock addr[%Lx]", + start); + + _nid = 0; + ret_end = end; + goto done; } - if (start > end) - start = end; + pa_start = start + mblock->offset; + m_match = 0; + m_mask = 0; - return start; + for (_nid = 0; _nid < num_node_masks; _nid++) { + struct node_mem_mask *const m = &node_masks[_nid]; + + if ((pa_start & m->mask) == m->match) { + m_match = m->match; + m_mask = m->mask; + break; + } + } + + if (num_node_masks == _nid) { + /* We could not find NUMA group, so default to 0, but lets + * search for latency group, so we could calculate the correct + * end address that we return + */ + _nid = 0; + + for (i = 0; i < num_mlgroups; i++) { + struct mdesc_mlgroup *const m = &mlgroups[i]; + + if ((pa_start & m->mask) == m->match) { + m_match = m->match; + m_mask = m->mask; + break; + } + } + + if (i == num_mlgroups) { + WARN_ONCE(1, "memblock_nid_range: Can't find latency group addr[%Lx]", + start); + + ret_end = end; + goto done; + } + } + + /* + * Each latency group has match and mask, and each memory block has an + * offset. An address belongs to a latency group if its address matches + * the following formula: ((addr + offset) & mask) == match + * It is, however, slow to check every single page if it matches a + * particular latency group. As optimization we calculate end value by + * using bit arithmetics. + */ + m_end = m_match + (1ul << __ffs(m_mask)) - mblock->offset; + m_end += pa_start & ~((1ul << fls64(m_mask)) - 1); + ret_end = m_end > end ? end : m_end; + +done: + *nid = _nid; + return ret_end; } #endif @@ -958,7 +1019,8 @@ static void init_node_masks_nonnuma(void) numadbg("Initializing tables for non-numa.\n"); - node_masks[0].mask = node_masks[0].val = 0; + node_masks[0].mask = 0; + node_masks[0].match = 0; num_node_masks = 1; #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -976,15 +1038,6 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(numa_cpumask_lookup_table); EXPORT_SYMBOL(node_data); -struct mdesc_mlgroup { - u64 node; - u64 latency; - u64 match; - u64 mask; -}; -static struct mdesc_mlgroup *mlgroups; -static int num_mlgroups; - static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio, u32 cfg_handle) { @@ -1226,41 +1279,6 @@ int __node_distance(int from, int to) return numa_latency[from][to]; } -static int find_numa_node_for_addr(unsigned long pa, - struct node_mem_mask *pnode_mask) -{ - struct mdesc_handle *md = mdesc_grab(); - u64 node, arc; - int i = 0; - - node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups"); - if (node == MDESC_NODE_NULL) - goto out; - - mdesc_for_each_node_by_name(md, node, "group") { - mdesc_for_each_arc(arc, md, node, MDESC_ARC_TYPE_FWD) { - u64 target = mdesc_arc_target(md, arc); - struct mdesc_mlgroup *m = find_mlgroup(target); - - if (!m) - continue; - if ((pa & m->mask) == m->match) { - if (pnode_mask) { - pnode_mask->mask = m->mask; - pnode_mask->val = m->match; - } - mdesc_release(md); - return i; - } - } - i++; - } - -out: - mdesc_release(md); - return -1; -} - static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp) { int i; @@ -1268,7 +1286,7 @@ static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp) for (i = 0; i < MAX_NUMNODES; i++) { struct node_mem_mask *n = &node_masks[i]; - if ((grp->mask == n->mask) && (grp->match == n->val)) + if ((grp->mask == n->mask) && (grp->match == n->match)) break; } return i; @@ -1323,10 +1341,10 @@ static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp, n = &node_masks[num_node_masks++]; n->mask = candidate->mask; - n->val = candidate->match; + n->match = candidate->match; - numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n", - index, n->mask, n->val, candidate->latency); + numadbg("NUMA NODE[%d]: mask[%lx] match[%lx] (latency[%llx])\n", + index, n->mask, n->match, candidate->latency); return 0; } @@ -1423,7 +1441,7 @@ static int __init numa_parse_jbus(void) numa_cpu_lookup_table[cpu] = index; cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu)); node_masks[index].mask = ~((1UL << 36UL) - 1UL); - node_masks[index].val = cpu << 36UL; + node_masks[index].match = cpu << 36UL; index++; } From cd429ce2d095041d249ec85feaed608bbf72154f Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 16 Feb 2017 15:13:54 -0500 Subject: [PATCH 12/13] sparc64: memblock resizes are not handled properly In add_node_ranges() when memblock resize happens, the iterator keeps using the previous freed array. This bug cause hangs on machine where there are over 128 memory blocks during boot. For example, on machines where memory interleaving is small. The problem is seen on T4-4 because it cant have 2T of memory, and memory is interleaved at 8G. So we have 2T/8G = 256 regions to set node IDs. The starting size of regions array is 128. Thus, we have to double at least one time (actually we have to double twice because some memory is already reserved and thus we need more than 256 regions). We start using an incorrect pointer to the array after the first doubling. Signed-off-by: Pavel Tatashin Signed-off-by: Babu Moger Reviewed-by: Babu Moger Signed-off-by: David S. Miller --- arch/sparc/mm/init_64.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 77446eaf1395..ccd455328989 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1126,6 +1126,10 @@ int of_node_to_nid(struct device_node *dp) static void __init add_node_ranges(void) { struct memblock_region *reg; + unsigned long prev_max; + +memblock_resized: + prev_max = memblock.memory.max; for_each_memblock(memory, reg) { unsigned long size = reg->size; @@ -1145,6 +1149,8 @@ static void __init add_node_ranges(void) memblock_set_node(start, this_end - start, &memblock.memory, nid); + if (memblock.memory.max != prev_max) + goto memblock_resized; start = this_end; } } From ac65e2828d03ddf84e9fe1fb6d110d8de933dc22 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Fri, 24 Feb 2017 03:03:16 -0800 Subject: [PATCH 13/13] sparc64: Fix build error in flush_tsb_user_page Patch "sparc64: Add 64K page size support" unconditionally used __flush_huge_tsb_one_entry() which is available only when hugetlb support is enabled. Another issue was incorrect TSB flushing for 64K pages in flush_tsb_user(). Signed-off-by: Nitin Gupta Signed-off-by: David S. Miller --- arch/sparc/mm/hugetlbpage.c | 5 +++-- arch/sparc/mm/tsb.c | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 605bfceb7d54..e98a3f2e8f0f 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -309,7 +309,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, addr &= ~(size - 1); orig = *ptep; - orig_shift = pte_none(orig) ? PAGE_SIZE : huge_tte_to_shift(orig); + orig_shift = pte_none(orig) ? PAGE_SHIFT : huge_tte_to_shift(orig); for (i = 0; i < nptes; i++) ptep[i] = __pte(pte_val(entry) + (i << shift)); @@ -335,7 +335,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, else nptes = size >> PAGE_SHIFT; - hugepage_shift = pte_none(entry) ? PAGE_SIZE : huge_tte_to_shift(entry); + hugepage_shift = pte_none(entry) ? PAGE_SHIFT : + huge_tte_to_shift(entry); if (pte_present(entry)) mm->context.hugetlb_pte_count -= nptes; diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index e39fc57ad850..23479c3d39f0 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -120,12 +120,18 @@ void flush_tsb_user(struct tlb_batch *tb) spin_lock_irqsave(&mm->context.lock, flags); - if (tb->hugepage_shift == PAGE_SHIFT) { + if (tb->hugepage_shift < HPAGE_SHIFT) { base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb; nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); + if (tb->hugepage_shift == PAGE_SHIFT) + __flush_tsb_one(tb, PAGE_SHIFT, base, nentries); +#if defined(CONFIG_HUGETLB_PAGE) + else + __flush_huge_tsb_one(tb, PAGE_SHIFT, base, nentries, + tb->hugepage_shift); +#endif } #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { @@ -152,8 +158,14 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr, nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; if (tlb_type == cheetah_plus || tlb_type == hypervisor) base = __pa(base); - __flush_huge_tsb_one_entry(base, vaddr, PAGE_SHIFT, nentries, - hugepage_shift); + if (hugepage_shift == PAGE_SHIFT) + __flush_tsb_one_entry(base, vaddr, PAGE_SHIFT, + nentries); +#if defined(CONFIG_HUGETLB_PAGE) + else + __flush_huge_tsb_one_entry(base, vaddr, PAGE_SHIFT, + nentries, hugepage_shift); +#endif } #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE) else if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {