tmpfs: convert shmem_getpage_gfp to radix-swap

Convert shmem_getpage_gfp(), the engine-room of shmem, to expect page or
swap entry returned from radix tree by find_lock_page().

Whereas the repetitive old method proceeded mainly under info->lock,
dropping and repeating whenever one of the conditions needed was not
met, now we can proceed without it, leaving shmem_add_to_page_cache() to
check for a race.

This way there is no need to preallocate a page, no need for an early
radix_tree_preload(), no need for mem_cgroup_shmem_charge_fallback().

Move the error unwinding down to the bottom instead of repeating it
throughout.  ENOSPC handling is a little different from before: there is
no longer any race between find_lock_page() and finding swap, but we can
arrive at ENOSPC before calling shmem_recalc_inode(), which might
occasionally discover freed space.

Be stricter to check i_size before returning.  info->lock is used for
little but alloced, swapped, i_blocks updates.  Move i_blocks updates
out from under the max_blocks check, so even an unlimited size=0 mount
can show accurate du.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Hugh Dickins 2011-08-03 16:21:24 -07:00 committed by Linus Torvalds
parent 46f65ec15c
commit 54af604218
1 changed file with 114 additions and 149 deletions

View File

@@ -166,15 +166,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
static LIST_HEAD(shmem_swaplist); static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex); static DEFINE_MUTEX(shmem_swaplist_mutex);
static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
percpu_counter_add(&sbinfo->used_blocks, -pages);
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
}
}
static int shmem_reserve_inode(struct super_block *sb) static int shmem_reserve_inode(struct super_block *sb)
{ {
struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -219,9 +210,12 @@ static void shmem_recalc_inode(struct inode *inode)
freed = info->alloced - info->swapped - inode->i_mapping->nrpages; freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) { if (freed > 0) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks)
percpu_counter_add(&sbinfo->used_blocks, -freed);
info->alloced -= freed; info->alloced -= freed;
inode->i_blocks -= freed * BLOCKS_PER_PAGE;
shmem_unacct_blocks(info->flags, freed); shmem_unacct_blocks(info->flags, freed);
shmem_free_blocks(inode, freed);
} }
} }
@@ -888,205 +882,180 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{ {
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo; struct shmem_sb_info *sbinfo;
struct page *page; struct page *page;
struct page *prealloc_page = NULL;
swp_entry_t swap; swp_entry_t swap;
int error; int error;
int once = 0;
if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
return -EFBIG; return -EFBIG;
repeat: repeat:
swap.val = 0;
page = find_lock_page(mapping, index); page = find_lock_page(mapping, index);
if (page) { if (radix_tree_exceptional_entry(page)) {
swap = radix_to_swp_entry(page);
page = NULL;
}
if (sgp != SGP_WRITE &&
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto failed;
}
if (page || (sgp == SGP_READ && !swap.val)) {
/* /*
* Once we can get the page lock, it must be uptodate: * Once we can get the page lock, it must be uptodate:
* if there were an error in reading back from swap, * if there were an error in reading back from swap,
* the page would not be inserted into the filecache. * the page would not be inserted into the filecache.
*/ */
BUG_ON(!PageUptodate(page)); BUG_ON(page && !PageUptodate(page));
goto done; *pagep = page;
return 0;
} }
/* /*
* Try to preload while we can wait, to not make a habit of * Fast cache lookup did not find it:
* draining atomic reserves; but don't latch on to this cpu. * bring it back from swap or allocate.
*/ */
error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); info = SHMEM_I(inode);
if (error) sbinfo = SHMEM_SB(inode->i_sb);
goto out;
radix_tree_preload_end();
if (sgp != SGP_READ && !prealloc_page) {
prealloc_page = shmem_alloc_page(gfp, info, index);
if (prealloc_page) {
SetPageSwapBacked(prealloc_page);
if (mem_cgroup_cache_charge(prealloc_page,
current->mm, GFP_KERNEL)) {
page_cache_release(prealloc_page);
prealloc_page = NULL;
}
}
}
spin_lock(&info->lock);
shmem_recalc_inode(inode);
swap = shmem_get_swap(info, index);
if (swap.val) { if (swap.val) {
/* Look it up and read it in.. */ /* Look it up and read it in.. */
page = lookup_swap_cache(swap); page = lookup_swap_cache(swap);
if (!page) { if (!page) {
spin_unlock(&info->lock);
/* here we actually do the io */ /* here we actually do the io */
if (fault_type) if (fault_type)
*fault_type |= VM_FAULT_MAJOR; *fault_type |= VM_FAULT_MAJOR;
page = shmem_swapin(swap, gfp, info, index); page = shmem_swapin(swap, gfp, info, index);
if (!page) { if (!page) {
swp_entry_t nswap = shmem_get_swap(info, index);
if (nswap.val == swap.val) {
error = -ENOMEM; error = -ENOMEM;
goto out; goto failed;
} }
goto repeat;
}
wait_on_page_locked(page);
page_cache_release(page);
goto repeat;
} }
/* We have to do this with page locked to prevent races */ /* We have to do this with page locked to prevent races */
if (!trylock_page(page)) { lock_page(page);
spin_unlock(&info->lock);
wait_on_page_locked(page);
page_cache_release(page);
goto repeat;
}
if (PageWriteback(page)) {
spin_unlock(&info->lock);
wait_on_page_writeback(page);
unlock_page(page);
page_cache_release(page);
goto repeat;
}
if (!PageUptodate(page)) { if (!PageUptodate(page)) {
spin_unlock(&info->lock);
unlock_page(page);
page_cache_release(page);
error = -EIO; error = -EIO;
goto out; goto failed;
}
wait_on_page_writeback(page);
/* Someone may have already done it for us */
if (page->mapping) {
if (page->mapping == mapping &&
page->index == index)
goto done;
error = -EEXIST;
goto failed;
} }
error = add_to_page_cache_locked(page, mapping, error = shmem_add_to_page_cache(page, mapping, index,
index, GFP_NOWAIT); gfp, swp_to_radix_entry(swap));
if (error) { if (error)
goto failed;
spin_lock(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
spin_unlock(&info->lock); spin_unlock(&info->lock);
if (error == -ENOMEM) {
/*
* reclaim from proper memory cgroup and
* call memcg's OOM if needed.
*/
error = mem_cgroup_shmem_charge_fallback(
page, current->mm, gfp);
if (error) {
unlock_page(page);
page_cache_release(page);
goto out;
}
}
unlock_page(page);
page_cache_release(page);
goto repeat;
}
delete_from_swap_cache(page); delete_from_swap_cache(page);
shmem_put_swap(info, index, (swp_entry_t){0});
info->swapped--;
spin_unlock(&info->lock);
set_page_dirty(page); set_page_dirty(page);
swap_free(swap); swap_free(swap);
} else if (sgp == SGP_READ) { } else {
page = find_get_page(mapping, index); if (shmem_acct_block(info->flags)) {
if (page && !trylock_page(page)) { error = -ENOSPC;
spin_unlock(&info->lock); goto failed;
wait_on_page_locked(page);
page_cache_release(page);
goto repeat;
} }
spin_unlock(&info->lock);
} else if (prealloc_page) {
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) { if (sbinfo->max_blocks) {
if (percpu_counter_compare(&sbinfo->used_blocks, if (percpu_counter_compare(&sbinfo->used_blocks,
sbinfo->max_blocks) >= 0 || sbinfo->max_blocks) >= 0) {
shmem_acct_block(info->flags)) error = -ENOSPC;
goto nospace; goto unacct;
}
percpu_counter_inc(&sbinfo->used_blocks); percpu_counter_inc(&sbinfo->used_blocks);
inode->i_blocks += BLOCKS_PER_PAGE;
} else if (shmem_acct_block(info->flags))
goto nospace;
page = prealloc_page;
prealloc_page = NULL;
swap = shmem_get_swap(info, index);
if (swap.val)
mem_cgroup_uncharge_cache_page(page);
else
error = add_to_page_cache_lru(page, mapping,
index, GFP_NOWAIT);
/*
* At add_to_page_cache_lru() failure,
* uncharge will be done automatically.
*/
if (swap.val || error) {
shmem_unacct_blocks(info->flags, 1);
shmem_free_blocks(inode, 1);
spin_unlock(&info->lock);
page_cache_release(page);
goto repeat;
} }
page = shmem_alloc_page(gfp, info, index);
if (!page) {
error = -ENOMEM;
goto decused;
}
SetPageSwapBacked(page);
__set_page_locked(page);
error = shmem_add_to_page_cache(page, mapping, index,
gfp, NULL);
if (error)
goto decused;
lru_cache_add_anon(page);
spin_lock(&info->lock);
info->alloced++; info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
spin_unlock(&info->lock); spin_unlock(&info->lock);
clear_highpage(page); clear_highpage(page);
flush_dcache_page(page); flush_dcache_page(page);
SetPageUptodate(page); SetPageUptodate(page);
if (sgp == SGP_DIRTY) if (sgp == SGP_DIRTY)
set_page_dirty(page); set_page_dirty(page);
} else {
spin_unlock(&info->lock);
error = -ENOMEM;
goto out;
} }
done: done:
*pagep = page; /* Perhaps the file has been truncated since we checked */
error = 0; if (sgp != SGP_WRITE &&
out: ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
if (prealloc_page) { error = -EINVAL;
mem_cgroup_uncharge_cache_page(prealloc_page); goto trunc;
page_cache_release(prealloc_page);
} }
return error; *pagep = page;
return 0;
nospace:
/* /*
* Perhaps the page was brought in from swap between find_lock_page * Error recovery.
* and taking info->lock? We allow for that at add_to_page_cache_lru,
* but must also avoid reporting a spurious ENOSPC while working on a
* full tmpfs.
*/ */
page = find_get_page(mapping, index); trunc:
ClearPageDirty(page);
delete_from_page_cache(page);
spin_lock(&info->lock);
info->alloced--;
inode->i_blocks -= BLOCKS_PER_PAGE;
spin_unlock(&info->lock); spin_unlock(&info->lock);
decused:
if (sbinfo->max_blocks)
percpu_counter_add(&sbinfo->used_blocks, -1);
unacct:
shmem_unacct_blocks(info->flags, 1);
failed:
if (swap.val && error != -EINVAL) {
struct page *test = find_get_page(mapping, index);
if (test && !radix_tree_exceptional_entry(test))
page_cache_release(test);
/* Have another try if the entry has changed */
if (test != swp_to_radix_entry(swap))
error = -EEXIST;
}
if (page) { if (page) {
unlock_page(page);
page_cache_release(page); page_cache_release(page);
}
if (error == -ENOSPC && !once++) {
info = SHMEM_I(inode);
spin_lock(&info->lock);
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
goto repeat; goto repeat;
} }
error = -ENOSPC; if (error == -EEXIST)
goto out; goto repeat;
return error;
} }
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -1095,9 +1064,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error; int error;
int ret = VM_FAULT_LOCKED; int ret = VM_FAULT_LOCKED;
if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error) if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -2164,8 +2130,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
if (config.max_inodes < inodes) if (config.max_inodes < inodes)
goto out; goto out;
/* /*
* Those tests also disallow limited->unlimited while any are in * Those tests disallow limited->unlimited while any are in use;
* use, so i_blocks will always be zero when max_blocks is zero;
* but we must separately disallow unlimited->limited, because * but we must separately disallow unlimited->limited, because
* in that case we have no record of how much is already in use. * in that case we have no record of how much is already in use.
*/ */