233 lines
5.4 KiB
C
233 lines
5.4 KiB
C
|
#include <linux/init.h>
|
||
|
#include <linux/bootmem.h>
|
||
|
#include <linux/fs.h>
|
||
|
#include <linux/sysfs.h>
|
||
|
#include <linux/kobject.h>
|
||
|
#include <linux/mm.h>
|
||
|
#include <linux/mmzone.h>
|
||
|
#include <linux/pagemap.h>
|
||
|
#include <linux/rmap.h>
|
||
|
#include <linux/mmu_notifier.h>
|
||
|
#include <linux/page_ext.h>
|
||
|
#include <linux/page_idle.h>
|
||
|
|
||
|
/*
 * The bitmap is transferred in u64-sized chunks: offsets and lengths handed
 * to the read/write handlers must be multiples of BITMAP_CHUNK_SIZE bytes,
 * and each chunk describes BITMAP_CHUNK_BITS consecutive PFNs (one bit each).
 */
#define BITMAP_CHUNK_SIZE	(sizeof(u64))
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
|
||
|
|
||
|
/*
|
||
|
* Idle page tracking only considers user memory pages, for other types of
|
||
|
* pages the idle flag is always unset and an attempt to set it is silently
|
||
|
* ignored.
|
||
|
*
|
||
|
* We treat a page as a user memory page if it is on an LRU list, because it is
|
||
|
* always safe to pass such a page to rmap_walk(), which is essential for idle
|
||
|
* page tracking. With such an indicator of user pages we can skip isolated
|
||
|
* pages, but since there are not usually many of them, it will hardly affect
|
||
|
* the overall result.
|
||
|
*
|
||
|
* This function tries to get a user memory page by pfn as described above.
|
||
|
*/
|
||
|
static struct page *page_idle_get_page(unsigned long pfn)
|
||
|
{
|
||
|
struct page *page;
|
||
|
struct zone *zone;
|
||
|
|
||
|
if (!pfn_valid(pfn))
|
||
|
return NULL;
|
||
|
|
||
|
page = pfn_to_page(pfn);
|
||
|
if (!page || !PageLRU(page) ||
|
||
|
!get_page_unless_zero(page))
|
||
|
return NULL;
|
||
|
|
||
|
zone = page_zone(page);
|
||
|
spin_lock_irq(&zone->lru_lock);
|
||
|
if (unlikely(!PageLRU(page))) {
|
||
|
put_page(page);
|
||
|
page = NULL;
|
||
|
}
|
||
|
spin_unlock_irq(&zone->lru_lock);
|
||
|
return page;
|
||
|
}
|
||
|
|
||
|
static int page_idle_clear_pte_refs_one(struct page *page,
|
||
|
struct vm_area_struct *vma,
|
||
|
unsigned long addr, void *arg)
|
||
|
{
|
||
|
struct mm_struct *mm = vma->vm_mm;
|
||
|
spinlock_t *ptl;
|
||
|
pmd_t *pmd;
|
||
|
pte_t *pte;
|
||
|
bool referenced = false;
|
||
|
|
||
|
if (unlikely(PageTransHuge(page))) {
|
||
|
pmd = page_check_address_pmd(page, mm, addr,
|
||
|
PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
|
||
|
if (pmd) {
|
||
|
referenced = pmdp_clear_young_notify(vma, addr, pmd);
|
||
|
spin_unlock(ptl);
|
||
|
}
|
||
|
} else {
|
||
|
pte = page_check_address(page, mm, addr, &ptl, 0);
|
||
|
if (pte) {
|
||
|
referenced = ptep_clear_young_notify(vma, addr, pte);
|
||
|
pte_unmap_unlock(pte, ptl);
|
||
|
}
|
||
|
}
|
||
|
if (referenced) {
|
||
|
clear_page_idle(page);
|
||
|
/*
|
||
|
* We cleared the referenced bit in a mapping to this page. To
|
||
|
* avoid interference with page reclaim, mark it young so that
|
||
|
* page_referenced() will return > 0.
|
||
|
*/
|
||
|
set_page_young(page);
|
||
|
}
|
||
|
return SWAP_AGAIN;
|
||
|
}
|
||
|
|
||
|
static void page_idle_clear_pte_refs(struct page *page)
|
||
|
{
|
||
|
/*
|
||
|
* Since rwc.arg is unused, rwc is effectively immutable, so we
|
||
|
* can make it static const to save some cycles and stack.
|
||
|
*/
|
||
|
static const struct rmap_walk_control rwc = {
|
||
|
.rmap_one = page_idle_clear_pte_refs_one,
|
||
|
.anon_lock = page_lock_anon_vma_read,
|
||
|
};
|
||
|
bool need_lock;
|
||
|
|
||
|
if (!page_mapped(page) ||
|
||
|
!page_rmapping(page))
|
||
|
return;
|
||
|
|
||
|
need_lock = !PageAnon(page) || PageKsm(page);
|
||
|
if (need_lock && !trylock_page(page))
|
||
|
return;
|
||
|
|
||
|
rmap_walk(page, (struct rmap_walk_control *)&rwc);
|
||
|
|
||
|
if (need_lock)
|
||
|
unlock_page(page);
|
||
|
}
|
||
|
|
||
|
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
|
||
|
struct bin_attribute *attr, char *buf,
|
||
|
loff_t pos, size_t count)
|
||
|
{
|
||
|
u64 *out = (u64 *)buf;
|
||
|
struct page *page;
|
||
|
unsigned long pfn, end_pfn;
|
||
|
int bit;
|
||
|
|
||
|
if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
|
||
|
return -EINVAL;
|
||
|
|
||
|
pfn = pos * BITS_PER_BYTE;
|
||
|
if (pfn >= max_pfn)
|
||
|
return 0;
|
||
|
|
||
|
end_pfn = pfn + count * BITS_PER_BYTE;
|
||
|
if (end_pfn > max_pfn)
|
||
|
end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
|
||
|
|
||
|
for (; pfn < end_pfn; pfn++) {
|
||
|
bit = pfn % BITMAP_CHUNK_BITS;
|
||
|
if (!bit)
|
||
|
*out = 0ULL;
|
||
|
page = page_idle_get_page(pfn);
|
||
|
if (page) {
|
||
|
if (page_is_idle(page)) {
|
||
|
/*
|
||
|
* The page might have been referenced via a
|
||
|
* pte, in which case it is not idle. Clear
|
||
|
* refs and recheck.
|
||
|
*/
|
||
|
page_idle_clear_pte_refs(page);
|
||
|
if (page_is_idle(page))
|
||
|
*out |= 1ULL << bit;
|
||
|
}
|
||
|
put_page(page);
|
||
|
}
|
||
|
if (bit == BITMAP_CHUNK_BITS - 1)
|
||
|
out++;
|
||
|
cond_resched();
|
||
|
}
|
||
|
return (char *)out - buf;
|
||
|
}
|
||
|
|
||
|
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
|
||
|
struct bin_attribute *attr, char *buf,
|
||
|
loff_t pos, size_t count)
|
||
|
{
|
||
|
const u64 *in = (u64 *)buf;
|
||
|
struct page *page;
|
||
|
unsigned long pfn, end_pfn;
|
||
|
int bit;
|
||
|
|
||
|
if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
|
||
|
return -EINVAL;
|
||
|
|
||
|
pfn = pos * BITS_PER_BYTE;
|
||
|
if (pfn >= max_pfn)
|
||
|
return -ENXIO;
|
||
|
|
||
|
end_pfn = pfn + count * BITS_PER_BYTE;
|
||
|
if (end_pfn > max_pfn)
|
||
|
end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
|
||
|
|
||
|
for (; pfn < end_pfn; pfn++) {
|
||
|
bit = pfn % BITMAP_CHUNK_BITS;
|
||
|
if ((*in >> bit) & 1) {
|
||
|
page = page_idle_get_page(pfn);
|
||
|
if (page) {
|
||
|
page_idle_clear_pte_refs(page);
|
||
|
set_page_idle(page);
|
||
|
put_page(page);
|
||
|
}
|
||
|
}
|
||
|
if (bit == BITMAP_CHUNK_BITS - 1)
|
||
|
in++;
|
||
|
cond_resched();
|
||
|
}
|
||
|
return (char *)in - buf;
|
||
|
}
|
||
|
|
||
|
static struct bin_attribute page_idle_bitmap_attr =
|
||
|
__BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
|
||
|
page_idle_bitmap_read, page_idle_bitmap_write, 0);
|
||
|
|
||
|
static struct bin_attribute *page_idle_bin_attrs[] = {
|
||
|
&page_idle_bitmap_attr,
|
||
|
NULL,
|
||
|
};
|
||
|
|
||
|
static struct attribute_group page_idle_attr_group = {
|
||
|
.bin_attrs = page_idle_bin_attrs,
|
||
|
.name = "page_idle",
|
||
|
};
|
||
|
|
||
|
#ifndef CONFIG_64BIT
|
||
|
static bool need_page_idle(void)
|
||
|
{
|
||
|
return true;
|
||
|
}
|
||
|
struct page_ext_operations page_idle_ops = {
|
||
|
.need = need_page_idle,
|
||
|
};
|
||
|
#endif
|
||
|
|
||
|
/*
 * Boot-time initialisation: create the "page_idle" sysfs group under mm_kobj.
 *
 * Returns 0 on success, or the error from sysfs_create_group() after logging
 * the failure.
 */
static int __init page_idle_init(void)
{
	int ret = sysfs_create_group(mm_kobj, &page_idle_attr_group);

	if (!ret)
		return 0;

	pr_err("page_idle: register sysfs failed\n");
	return ret;
}
subsys_initcall(page_idle_init);
|