diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 72cd942edb22..0160201993d4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +extern int madvise_free_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 882b04449904..1a4989fef08f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1501,6 +1501,77 @@ out: return 0; } +int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next) + +{ + spinlock_t *ptl; + pmd_t orig_pmd; + struct page *page; + struct mm_struct *mm = tlb->mm; + int ret = 0; + + if (!pmd_trans_huge_lock(pmd, vma, &ptl)) + goto out; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) { + ret = 1; + goto out; + } + + page = pmd_page(orig_pmd); + /* + * If other processes are mapping this page, we couldn't discard + * the page unless they all do MADV_FREE so let's skip the page. + */ + if (page_mapcount(page) != 1) + goto out; + + if (!trylock_page(page)) + goto out; + + /* + * If user want to discard part-pages of THP, split it so MADV_FREE + * will deactivate only them. + */ + if (next - addr != HPAGE_PMD_SIZE) { + get_page(page); + spin_unlock(ptl); + if (split_huge_page(page)) { + put_page(page); + unlock_page(page); + goto out_unlocked; + } + put_page(page); + unlock_page(page); + ret = 1; + goto out_unlocked; + } + + if (PageDirty(page)) + ClearPageDirty(page); + unlock_page(page); + + if (PageActive(page)) + deactivate_page(page); + + if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + ret = 1; +out: + spin_unlock(ptl); +out_unlocked: + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -2710,7 +2781,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write; + bool young, write, dirty; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); @@ -2734,6 +2805,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, atomic_add(HPAGE_PMD_NR - 1, &page->_count); write = pmd_write(*pmd); young = pmd_young(*pmd); + dirty = pmd_dirty(*pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2751,12 +2823,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = swp_entry_to_pte(swp_entry); } else { entry = mk_pte(page + i, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + entry = maybe_mkwrite(entry, vma); if (!write) entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); } + if (dirty) + SetPageDirty(page + i); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -2962,6 +3036,8 @@ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, continue; flush_cache_page(vma, address, page_to_pfn(page)); entry = ptep_clear_flush(vma, address, pte + i); + if (pte_dirty(entry)) + SetPageDirty(page); swp_entry = make_migration_entry(page, pte_write(entry)); swp_pte = swp_entry_to_pte(swp_entry); if (pte_soft_dirty(entry)) @@ -3028,7 +3104,8 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, page_add_anon_rmap(page, vma, address, false); entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkdirty(entry); + if (PageDirty(page)) + entry = pte_mkdirty(entry); if (is_write_migration_entry(swp_entry)) entry = maybe_mkwrite(entry, vma); @@ -3089,8 +3166,8 @@ static int __split_huge_page_tail(struct page *head, int tail, (1L << PG_uptodate) | (1L << PG_active) | (1L << PG_locked) | - (1L << PG_unevictable))); - page_tail->flags |= (1L << PG_dirty); + (1L << PG_unevictable) | + (1L << PG_dirty))); /* * After clearing PageTail the gup refcount can be released. diff --git a/mm/madvise.c b/mm/madvise.c index 4e9454622801..f56825b6d2e1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -271,8 +271,13 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, pte_t *orig_pte, *pte, ptent; struct page *page; int nr_swap = 0; + unsigned long next; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) + if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) + goto next; - split_huge_pmd(vma, pmd, addr); if (pmd_trans_unstable(pmd)) return 0; @@ -383,6 +388,7 @@ out: arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_pte, ptl); cond_resched(); +next: return 0; }