0895ecda79
The hugepage arch code provides a number of hook functions/macros which mirror the functionality of various normal page pte access functions. Various changes in the normal page accessors (in particular BenH's recent changes to the handling of lazy icache flushing and PAGE_EXEC) have caused the hugepage versions to get out of sync with the originals. In some cases, this is a bug, at least on some MMU types. One of the reasons that some hooks were not identical to the normal page versions is that the fact we're dealing with a hugepage needed to be passed down to use the correct dcache-icache flush function. This patch makes the main flush_dcache_icache_page() function hugepage aware (by checking for the PageCompound flag). That in turn means we can make set_huge_pte_at() just a call to set_pte_at(), bringing it back into sync. As a bonus, this lets us remove the hash_huge_page_do_lazy_icache() function, replacing it with a call to the hash_page_do_lazy_icache() function it was based on. Some other hugepage pte access hooks - huge_ptep_get_and_clear() and huge_ptep_clear_flush() - are not so easily unified, but this patch at least brings them back into sync with the current versions of the corresponding normal page functions. Signed-off-by: David Gibson <dwg@au1.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
572 lines
13 KiB
C
572 lines
13 KiB
C
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/io.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlb.h>
|
|
|
|
/* log2 of the page sizes referred to by name in this file. */
#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

/* Capacity of the temporary gigantic-page address table below. */
#define MAX_NUMBER_GPAGES	1024

/*
 * Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  nr_gpages counts the used entries of
 * gpage_freearray[].
 */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
|
|
|
|
/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
|
|
|
|
static inline int shift_to_mmu_psize(unsigned int shift)
|
|
{
|
|
int psize;
|
|
|
|
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
|
|
if (mmu_psize_defs[psize].shift == shift)
|
|
return psize;
|
|
return -1;
|
|
}
|
|
|
|
static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
|
|
{
|
|
if (mmu_psize_defs[mmu_psize].shift)
|
|
return mmu_psize_defs[mmu_psize].shift;
|
|
BUG();
|
|
}
|
|
|
|
/* True when the hugepd entry is empty, i.e. its pd word is zero. */
#define hugepd_none(hpd)	((hpd).pd == 0)
|
|
|
|
/*
 * Recover the hugepte table pointer stored in a hugepd entry.
 *
 * The low HUGEPD_SHIFT_MASK bits (which hold the page shift) are
 * stripped and the 0xc000000000000000 bits are ORed back in; bit 63 is
 * cleared when the entry is written by __hugepte_alloc().
 * BUGs if the entry does not satisfy hugepd_ok().
 */
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	unsigned long table;

	BUG_ON(!hugepd_ok(hpd));

	table = hpd.pd & ~HUGEPD_SHIFT_MASK;
	table |= 0xc000000000000000;

	return (pte_t *)table;
}
|
|
|
|
/*
 * Extract the page shift kept in the low HUGEPD_SHIFT_MASK bits of a
 * hugepd entry.
 */
static inline unsigned int hugepd_shift(hugepd_t hpd)
{
	unsigned int shift = hpd.pd & HUGEPD_SHIFT_MASK;

	return shift;
}
|
|
|
|
/*
 * Locate the hugepte for @addr inside the table referenced by *@hpdp.
 *
 * @pdshift is the shift of the directory level holding the hugepd entry:
 * the address bits below it, divided by the huge page size, give the
 * index into the hugepte table.
 */
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
{
	unsigned long within_dir = addr & ((1UL << pdshift) - 1);
	pte_t *table = hugepd_page(*hpdp);

	return table + (within_dir >> hugepd_shift(*hpdp));
}
|
|
|
|
/*
 * Walk the page tables rooted at @pgdir looking for the pte mapping @ea.
 *
 * At each level (pgd, pud, pmd) the entry may be a hugepd pointer; if so
 * the walk stops there and the matching hugepte is returned.  Otherwise
 * the walk descends until it reaches a normal pte.
 *
 * @shift: optional.  Set to 0 up front, and to the huge page shift when a
 *         hugepd entry is found; it stays 0 for a normal pte.
 *
 * Returns the pte/hugepte pointer, or NULL if an empty entry is met on
 * the way down.
 */
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	hugepd_t *hpdp;
	unsigned pdshift;
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;

	if (shift)
		*shift = 0;

	/* Top level. */
	pdshift = PGDIR_SHIFT;
	pg = pgdir + pgd_index(ea);
	if (is_hugepd(pg)) {
		hpdp = (hugepd_t *)pg;
		goto found_hugepd;
	}
	if (pgd_none(*pg))
		return NULL;

	/* PUD level. */
	pdshift = PUD_SHIFT;
	pu = pud_offset(pg, ea);
	if (is_hugepd(pu)) {
		hpdp = (hugepd_t *)pu;
		goto found_hugepd;
	}
	if (pud_none(*pu))
		return NULL;

	/* PMD level. */
	pdshift = PMD_SHIFT;
	pm = pmd_offset(pu, ea);
	if (is_hugepd(pm)) {
		hpdp = (hugepd_t *)pm;
		goto found_hugepd;
	}
	if (pmd_none(*pm))
		return NULL;

	/* Ordinary page table page. */
	return pte_offset_map(pm, ea);

found_hugepd:
	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
|
|
{
|
|
return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
|
|
}
|
|
|
|
/*
 * Allocate a zeroed hugepte table and install it in *@hpdp.
 *
 * The table comes from the pgtable cache for (@pdshift - @pshift).  The
 * entry is written under mm->page_table_lock; if somebody else filled
 * the slot in the meantime the fresh table is simply freed again.
 * The stored word is the table address with bit 63 cleared, ORed with
 * @pshift in the low bits.
 *
 * Returns 0 on success (including the lost-race case), -ENOMEM if the
 * allocation failed.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	pte_t *table = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
					 GFP_KERNEL|__GFP_REPEAT);

	/* The shift must fit in, and the pointer must leave free, the low bits. */
	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)table & HUGEPD_SHIFT_MASK);

	if (!table)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (hugepd_none(*hpdp))
		hpdp->pd = ((unsigned long)table & ~0x8000000000000000) | pshift;
	else
		kmem_cache_free(PGT_CACHE(pdshift - pshift), table);
	spin_unlock(&mm->page_table_lock);

	return 0;
}
|
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
|
|
{
|
|
pgd_t *pg;
|
|
pud_t *pu;
|
|
pmd_t *pm;
|
|
hugepd_t *hpdp = NULL;
|
|
unsigned pshift = __ffs(sz);
|
|
unsigned pdshift = PGDIR_SHIFT;
|
|
|
|
addr &= ~(sz-1);
|
|
|
|
pg = pgd_offset(mm, addr);
|
|
if (pshift >= PUD_SHIFT) {
|
|
hpdp = (hugepd_t *)pg;
|
|
} else {
|
|
pdshift = PUD_SHIFT;
|
|
pu = pud_alloc(mm, pg, addr);
|
|
if (pshift >= PMD_SHIFT) {
|
|
hpdp = (hugepd_t *)pu;
|
|
} else {
|
|
pdshift = PMD_SHIFT;
|
|
pm = pmd_alloc(mm, pu, addr);
|
|
hpdp = (hugepd_t *)pm;
|
|
}
|
|
}
|
|
|
|
if (!hpdp)
|
|
return NULL;
|
|
|
|
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
|
|
|
|
if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
|
|
return NULL;
|
|
|
|
return hugepte_offset(hpdp, addr, pdshift);
|
|
}
|
|
|
|
/* Build list of addresses of gigantic pages. This function is used in early
|
|
* boot before the buddy or bootmem allocator is setup.
|
|
*/
|
|
void add_gpage(unsigned long addr, unsigned long page_size,
|
|
unsigned long number_of_pages)
|
|
{
|
|
if (!addr)
|
|
return;
|
|
while (number_of_pages > 0) {
|
|
gpage_freearray[nr_gpages] = addr;
|
|
nr_gpages++;
|
|
number_of_pages--;
|
|
addr += page_size;
|
|
}
|
|
}
|
|
|
|
/* Moves the gigantic page addresses from the temporary list to the
|
|
* huge_boot_pages list.
|
|
*/
|
|
int alloc_bootmem_huge_page(struct hstate *hstate)
|
|
{
|
|
struct huge_bootmem_page *m;
|
|
if (nr_gpages == 0)
|
|
return 0;
|
|
m = phys_to_virt(gpage_freearray[--nr_gpages]);
|
|
gpage_freearray[nr_gpages] = 0;
|
|
list_add(&m->list, &huge_boot_pages);
|
|
m->hstate = hstate;
|
|
return 1;
|
|
}
|
|
|
|
/*
 * Stub: this implementation never unshares anything and always
 * returns 0; all arguments are ignored.
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}
|
|
|
|
/*
 * Free the hugepte table behind *@hpdp if the directory slot it occupies
 * lies entirely inside [@floor, @ceiling).
 *
 * @pdshift is the shift of the directory level containing the entry.
 * When the slot may be freed, the entry is cleared, the gather is marked
 * as needing a flush and the table is queued via pgtable_free_tlb().
 */
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	unsigned long slot_mask = ~((1UL << pdshift) - 1);
	unsigned page_shift = hugepd_shift(*hpdp);
	pte_t *table = hugepd_page(*hpdp);

	/* Slot starts below the floor: not ours to free. */
	start &= slot_mask;
	if (start < floor)
		return;

	if (ceiling) {
		ceiling &= slot_mask;
		if (!ceiling)
			return;
	}
	/* The -1 on both sides lets a zero ceiling act as "no limit". */
	if (end - 1 > ceiling - 1)
		return;

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, table, pdshift - page_shift);
}
|
|
|
|
/*
 * Free the hugepte tables hanging off the pmd entries covering
 * [@addr, @end), then free the pmd page itself if the whole pud slot
 * lies inside [@floor, @ceiling).
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	unsigned long start = addr;
	unsigned long next;
	pmd_t *pmd = pmd_offset(pud, addr);

	do {
		next = pmd_addr_end(addr, end);
		/* Every populated pmd entry is handled as a hugepd pointer. */
		if (!pmd_none(*pmd))
			free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
					  addr, next, floor, ceiling);
	} while (pmd++, addr = next, addr != end);

	/* Can the pmd page itself go? */
	start &= PUD_MASK;
	if (start < floor)
		return;

	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}
|
|
|
|
/*
 * Free everything below the pud entries covering [@addr, @end): hugepte
 * tables for entries that are hugepd pointers, pmd ranges otherwise.
 * Afterwards the pud page itself is freed if the whole pgd slot lies
 * inside [@floor, @ceiling).
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	unsigned long start = addr;
	unsigned long next;
	pud_t *pud = pud_offset(pgd, addr);

	do {
		next = pud_addr_end(addr, end);
		if (is_hugepd(pud)) {
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		} else if (!pud_none_or_clear_bad(pud)) {
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		}
	} while (pud++, addr = next, addr != end);

	/* Can the pud page itself go? */
	start &= PGDIR_MASK;
	if (start < floor)
		return;

	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}
|
|
|
|
/*
|
|
* This function frees user-level page tables of a process.
|
|
*
|
|
* Must be called with pagetable lock held.
|
|
*/
|
|
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
|
|
unsigned long addr, unsigned long end,
|
|
unsigned long floor, unsigned long ceiling)
|
|
{
|
|
pgd_t *pgd;
|
|
unsigned long next;
|
|
|
|
/*
|
|
* Because there are a number of different possible pagetable
|
|
* layouts for hugepage ranges, we limit knowledge of how
|
|
* things should be laid out to the allocation path
|
|
* (huge_pte_alloc(), above). Everything else works out the
|
|
* structure as it goes from information in the hugepd
|
|
* pointers. That means that we can't here use the
|
|
* optimization used in the normal page free_pgd_range(), of
|
|
* checking whether we're actually covering a large enough
|
|
* range to have to do anything at the top level of the walk
|
|
* instead of at the bottom.
|
|
*
|
|
* To make sense of this, you should probably go read the big
|
|
* block comment at the top of the normal free_pgd_range(),
|
|
* too.
|
|
*/
|
|
|
|
pgd = pgd_offset(tlb->mm, addr);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
if (!is_hugepd(pgd)) {
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
continue;
|
|
hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
|
|
} else {
|
|
free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
|
|
addr, next, floor, ceiling);
|
|
}
|
|
} while (pgd++, addr = next, addr != end);
|
|
}
|
|
|
|
struct page *
|
|
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
|
{
|
|
pte_t *ptep;
|
|
struct page *page;
|
|
unsigned shift;
|
|
unsigned long mask;
|
|
|
|
ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
|
|
|
|
/* Verify it is a huge page else bail. */
|
|
if (!ptep || !shift)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
mask = (1UL << shift) - 1;
|
|
page = pte_page(*ptep);
|
|
if (page)
|
|
page += (address & mask) / PAGE_SIZE;
|
|
|
|
return page;
|
|
}
|
|
|
|
/* Always reports "not huge" (0), whatever @pmd holds. */
int pmd_huge(pmd_t pmd)
{
	return 0;
}
|
|
|
|
/* Always reports "not huge" (0), whatever @pud holds. */
int pud_huge(pud_t pud)
{
	return 0;
}
|
|
|
|
struct page *
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
pmd_t *pmd, int write)
|
|
{
|
|
BUG();
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * Collect the subpages of one huge page for a get_user_pages-style walk.
 *
 * @ptep:  hugepte mapping the page
 * @sz:    huge page size in bytes
 * @addr:  first address wanted; @end: end of the wanted range (clamped
 *         here to the end of this huge page)
 * @write: non-zero if write access is required
 * @pages: output array; entries are stored starting at index *@nr
 * @nr:    running count of entries in @pages, advanced on success
 *
 * References are taken on the head page, one per recorded subpage.
 *
 * Returns 1 on success.  Returns 0, with *@nr restored to its value on
 * entry and no references held, if the pte lacks the required
 * permission bits, the speculative reference could not be taken, or the
 * pte changed while the references were being taken.
 */
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		       unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/*
		 * The pte changed under us.  Undo only what this call did:
		 * forget the entries recorded above and drop the references,
		 * which were taken on the head page.  Entries added to
		 * @pages by earlier calls must be left alone.
		 */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}
|
|
|
|
/*
 * Walk the hugeptes behind a hugepd entry for the range [@addr, @end)
 * and collect their pages via gup_hugepte().
 *
 * @pdshift is the shift of the directory level holding @hugepd.
 *
 * Returns 1 if every hugepte in the range was collected, 0 as soon as
 * gup_hugepte() fails for one of them.
 */
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		/*
		 * Advance to the next hugepage boundary, clamped to @end.
		 * @addr need not be hugepage aligned, so stepping by a
		 * plain "addr += sz" could jump over @end and never
		 * satisfy the loop condition.
		 */
		next = (addr + sz) & ~(sz - 1);
		if (next - 1 >= end - 1)
			next = end;

		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}
|
|
|
|
/*
 * Pick an unmapped area for a hugetlbfs mapping of @file.
 *
 * The file's hstate gives the huge page shift, which is translated to an
 * MMU page size index and passed on to slice_get_unmapped_area().
 * @pgoff is not used.
 * NOTE(review): the trailing 1 and 0 arguments are passed through as-is;
 * their meaning is defined by slice_get_unmapped_area() - confirm there.
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	unsigned int hshift = huge_page_shift(h);
	int mmu_psize = shift_to_mmu_psize(hshift);

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
|
|
|
|
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
|
|
{
|
|
unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
|
|
|
|
return 1UL << mmu_psize_to_shift(psize);
|
|
}
|
|
|
|
/*
 * Register a huge page size of @size bytes with the hugetlb core.
 *
 * Returns 0 if the size was registered or had been registered already,
 * -EINVAL if it is not a power of two, is outside the
 * (PAGE_SHIFT, SLICE_HIGH_SHIFT] shift range, has no entry in
 * mmu_psize_defs[], or is 64K while CONFIG_SPU_FS_64K_LS is enabled.
 */
static int __init add_huge_page_size(unsigned long long size)
{
	int shift;
	int mmu_psize;

	/*
	 * Reject non powers of two (including 0) before taking the shift:
	 * __ffs() is undefined for a zero argument.
	 */
	if (!is_power_of_2(size))
		return -EINVAL;

	shift = __ffs(size);

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if ((shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}
|
|
|
|
static int __init hugepage_setup_sz(char *str)
|
|
{
|
|
unsigned long long size;
|
|
|
|
size = memparse(str, &str);
|
|
|
|
if (add_huge_page_size(size) != 0)
|
|
printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
|
|
|
|
return 1;
|
|
}
|
|
__setup("hugepagesz=", hugepage_setup_sz);
|
|
|
|
/*
 * Boot-time setup of huge page support.
 *
 * Bails out with -ENODEV unless the CPU has CPU_FTR_16M_PAGE.  Otherwise
 * every page size present in mmu_psize_defs[] that add_huge_page_size()
 * accepts gets a pgtable cache for its hugepte tables (failure to create
 * one panics), and finally the default huge page shift is chosen.
 */
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		/* Unused slot in the page size table. */
		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		/*
		 * The hugepd pointer sits at the lowest directory level
		 * whose shift is still larger than the page shift.
		 */
		pdshift = PGDIR_SHIFT;
		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}

	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

	return 0;
}

module_init(hugetlbpage_init);
|
|
|
|
void flush_dcache_icache_hugepage(struct page *page)
|
|
{
|
|
int i;
|
|
|
|
BUG_ON(!PageCompound(page));
|
|
|
|
for (i = 0; i < (1UL << compound_order(page)); i++)
|
|
__flush_dcache_icache(page_address(page+i));
|
|
}
|