Merge branch 'page-refs' (page ref overflow)
Merge page ref overflow branch. Jann Horn reported that he can overflow the page ref count with sufficient memory (and a filesystem that is intentionally extremely slow). Admittedly it's not exactly easy. To have more than four billion references to a page requires a minimum of 32GB of kernel memory just for the pointers to the pages, much less any metadata to keep track of those pointers. Jann needed a total of 140GB of memory and a specially crafted filesystem that leaves all reads pending (in order to not ever free the page references and just keep adding more). Still, we have a fairly straightforward way to limit the two obvious user-controllable sources of page references: direct-IO like page references gotten through get_user_pages(), and the splice pipe page duplication. So let's just do that. * branch page-refs: fs: prevent page refcount overflow in pipe_buf_get mm: prevent get_user_pages() from overflowing page refcount mm: add 'try_get_page()' helper function mm: make page ref count overflow check tighter and more explicit
This commit is contained in:
commit
6b3a707736
|
@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
|
||||||
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
|
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
|
||||||
|
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
if (rem < len) {
|
if (rem < len)
|
||||||
pipe_unlock(pipe);
|
goto out_free;
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
rem = len;
|
rem = len;
|
||||||
while (rem) {
|
while (rem) {
|
||||||
|
@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
|
||||||
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
|
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
|
||||||
pipe->nrbufs--;
|
pipe->nrbufs--;
|
||||||
} else {
|
} else {
|
||||||
pipe_buf_get(pipe, ibuf);
|
if (!pipe_buf_get(pipe, ibuf))
|
||||||
|
goto out_free;
|
||||||
|
|
||||||
*obuf = *ibuf;
|
*obuf = *ibuf;
|
||||||
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
|
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
|
||||||
obuf->len = rem;
|
obuf->len = rem;
|
||||||
|
@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
|
||||||
ret = fuse_dev_do_write(fud, &cs, len);
|
ret = fuse_dev_do_write(fud, &cs, len);
|
||||||
|
|
||||||
pipe_lock(pipe);
|
pipe_lock(pipe);
|
||||||
|
out_free:
|
||||||
for (idx = 0; idx < nbuf; idx++)
|
for (idx = 0; idx < nbuf; idx++)
|
||||||
pipe_buf_release(pipe, &bufs[idx]);
|
pipe_buf_release(pipe, &bufs[idx]);
|
||||||
pipe_unlock(pipe);
|
pipe_unlock(pipe);
|
||||||
|
|
||||||
out:
|
|
||||||
kvfree(bufs);
|
kvfree(bufs);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
|
@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
|
||||||
* in the tee() system call, when we duplicate the buffers in one
|
* in the tee() system call, when we duplicate the buffers in one
|
||||||
* pipe into another.
|
* pipe into another.
|
||||||
*/
|
*/
|
||||||
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
|
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
|
||||||
{
|
{
|
||||||
get_page(buf->page);
|
return try_get_page(buf->page);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(generic_pipe_buf_get);
|
EXPORT_SYMBOL(generic_pipe_buf_get);
|
||||||
|
|
||||||
|
|
12
fs/splice.c
12
fs/splice.c
|
@ -1593,7 +1593,11 @@ retry:
|
||||||
* Get a reference to this pipe buffer,
|
* Get a reference to this pipe buffer,
|
||||||
* so we can copy the contents over.
|
* so we can copy the contents over.
|
||||||
*/
|
*/
|
||||||
pipe_buf_get(ipipe, ibuf);
|
if (!pipe_buf_get(ipipe, ibuf)) {
|
||||||
|
if (ret == 0)
|
||||||
|
ret = -EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
*obuf = *ibuf;
|
*obuf = *ibuf;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
|
||||||
* Get a reference to this pipe buffer,
|
* Get a reference to this pipe buffer,
|
||||||
* so we can copy the contents over.
|
* so we can copy the contents over.
|
||||||
*/
|
*/
|
||||||
pipe_buf_get(ipipe, ibuf);
|
if (!pipe_buf_get(ipipe, ibuf)) {
|
||||||
|
if (ret == 0)
|
||||||
|
ret = -EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
obuf = opipe->bufs + nbuf;
|
obuf = opipe->bufs + nbuf;
|
||||||
*obuf = *ibuf;
|
*obuf = *ibuf;
|
||||||
|
|
|
@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||||
|
|
||||||
|
/* 127: arbitrary random number, small enough to assemble well */
|
||||||
|
#define page_ref_zero_or_close_to_overflow(page) \
|
||||||
|
((unsigned int) page_ref_count(page) + 127u <= 127u)
|
||||||
|
|
||||||
static inline void get_page(struct page *page)
|
static inline void get_page(struct page *page)
|
||||||
{
|
{
|
||||||
page = compound_head(page);
|
page = compound_head(page);
|
||||||
|
@ -973,10 +977,19 @@ static inline void get_page(struct page *page)
|
||||||
* Getting a normal page or the head of a compound page
|
* Getting a normal page or the head of a compound page
|
||||||
* requires to already have an elevated page->_refcount.
|
* requires to already have an elevated page->_refcount.
|
||||||
*/
|
*/
|
||||||
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
|
VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
|
||||||
page_ref_inc(page);
|
page_ref_inc(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline __must_check bool try_get_page(struct page *page)
|
||||||
|
{
|
||||||
|
page = compound_head(page);
|
||||||
|
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
|
||||||
|
return false;
|
||||||
|
page_ref_inc(page);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void put_page(struct page *page)
|
static inline void put_page(struct page *page)
|
||||||
{
|
{
|
||||||
page = compound_head(page);
|
page = compound_head(page);
|
||||||
|
|
|
@ -101,18 +101,20 @@ struct pipe_buf_operations {
|
||||||
/*
|
/*
|
||||||
* Get a reference to the pipe buffer.
|
* Get a reference to the pipe buffer.
|
||||||
*/
|
*/
|
||||||
void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
|
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* pipe_buf_get - get a reference to a pipe_buffer
|
* pipe_buf_get - get a reference to a pipe_buffer
|
||||||
* @pipe: the pipe that the buffer belongs to
|
* @pipe: the pipe that the buffer belongs to
|
||||||
* @buf: the buffer to get a reference to
|
* @buf: the buffer to get a reference to
|
||||||
|
*
|
||||||
|
* Return: %true if the reference was successfully obtained.
|
||||||
*/
|
*/
|
||||||
static inline void pipe_buf_get(struct pipe_inode_info *pipe,
|
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
|
||||||
struct pipe_buffer *buf)
|
struct pipe_buffer *buf)
|
||||||
{
|
{
|
||||||
buf->ops->get(pipe, buf);
|
return buf->ops->get(pipe, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
|
||||||
void free_pipe_info(struct pipe_inode_info *);
|
void free_pipe_info(struct pipe_inode_info *);
|
||||||
|
|
||||||
/* Generic pipe buffer ops functions */
|
/* Generic pipe buffer ops functions */
|
||||||
void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
|
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
|
||||||
int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
|
int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
|
||||||
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
|
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
|
||||||
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
|
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
|
||||||
|
|
|
@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
|
||||||
buf->private = 0;
|
buf->private = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
|
static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
|
||||||
struct pipe_buffer *buf)
|
struct pipe_buffer *buf)
|
||||||
{
|
{
|
||||||
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
|
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
|
||||||
|
|
||||||
|
if (ref->ref > INT_MAX/2)
|
||||||
|
return false;
|
||||||
|
|
||||||
ref->ref++;
|
ref->ref++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Pipe buffer operations for a buffer. */
|
/* Pipe buffer operations for a buffer. */
|
||||||
|
|
48
mm/gup.c
48
mm/gup.c
|
@ -160,8 +160,12 @@ retry:
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & FOLL_GET)
|
if (flags & FOLL_GET) {
|
||||||
get_page(page);
|
if (unlikely(!try_get_page(page))) {
|
||||||
|
page = ERR_PTR(-ENOMEM);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (flags & FOLL_TOUCH) {
|
if (flags & FOLL_TOUCH) {
|
||||||
if ((flags & FOLL_WRITE) &&
|
if ((flags & FOLL_WRITE) &&
|
||||||
!pte_dirty(pte) && !PageDirty(page))
|
!pte_dirty(pte) && !PageDirty(page))
|
||||||
|
@ -298,7 +302,10 @@ retry_locked:
|
||||||
if (pmd_trans_unstable(pmd))
|
if (pmd_trans_unstable(pmd))
|
||||||
ret = -EBUSY;
|
ret = -EBUSY;
|
||||||
} else {
|
} else {
|
||||||
get_page(page);
|
if (unlikely(!try_get_page(page))) {
|
||||||
|
spin_unlock(ptl);
|
||||||
|
return ERR_PTR(-ENOMEM);
|
||||||
|
}
|
||||||
spin_unlock(ptl);
|
spin_unlock(ptl);
|
||||||
lock_page(page);
|
lock_page(page);
|
||||||
ret = split_huge_page(page);
|
ret = split_huge_page(page);
|
||||||
|
@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
|
||||||
if (is_device_public_page(*page))
|
if (is_device_public_page(*page))
|
||||||
goto unmap;
|
goto unmap;
|
||||||
}
|
}
|
||||||
get_page(*page);
|
if (unlikely(!try_get_page(*page))) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto unmap;
|
||||||
|
}
|
||||||
out:
|
out:
|
||||||
ret = 0;
|
ret = 0;
|
||||||
unmap:
|
unmap:
|
||||||
|
@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
 * Return the compound head page with ref appropriately incremented,
|
||||||
|
* or NULL if that failed.
|
||||||
|
*/
|
||||||
|
static inline struct page *try_get_compound_head(struct page *page, int refs)
|
||||||
|
{
|
||||||
|
struct page *head = compound_head(page);
|
||||||
|
if (WARN_ON_ONCE(page_ref_count(head) < 0))
|
||||||
|
return NULL;
|
||||||
|
if (unlikely(!page_cache_add_speculative(head, refs)))
|
||||||
|
return NULL;
|
||||||
|
return head;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
|
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
|
||||||
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||||
int write, struct page **pages, int *nr)
|
int write, struct page **pages, int *nr)
|
||||||
|
@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||||
|
|
||||||
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
||||||
page = pte_page(pte);
|
page = pte_page(pte);
|
||||||
head = compound_head(page);
|
|
||||||
|
|
||||||
if (!page_cache_get_speculative(head))
|
head = try_get_compound_head(page, 1);
|
||||||
|
if (!head)
|
||||||
goto pte_unmap;
|
goto pte_unmap;
|
||||||
|
|
||||||
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
||||||
|
@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||||
refs++;
|
refs++;
|
||||||
} while (addr += PAGE_SIZE, addr != end);
|
} while (addr += PAGE_SIZE, addr != end);
|
||||||
|
|
||||||
head = compound_head(pmd_page(orig));
|
head = try_get_compound_head(pmd_page(orig), refs);
|
||||||
if (!page_cache_add_speculative(head, refs)) {
|
if (!head) {
|
||||||
*nr -= refs;
|
*nr -= refs;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||||
refs++;
|
refs++;
|
||||||
} while (addr += PAGE_SIZE, addr != end);
|
} while (addr += PAGE_SIZE, addr != end);
|
||||||
|
|
||||||
head = compound_head(pud_page(orig));
|
head = try_get_compound_head(pud_page(orig), refs);
|
||||||
if (!page_cache_add_speculative(head, refs)) {
|
if (!head) {
|
||||||
*nr -= refs;
|
*nr -= refs;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
|
||||||
refs++;
|
refs++;
|
||||||
} while (addr += PAGE_SIZE, addr != end);
|
} while (addr += PAGE_SIZE, addr != end);
|
||||||
|
|
||||||
head = compound_head(pgd_page(orig));
|
head = try_get_compound_head(pgd_page(orig), refs);
|
||||||
if (!page_cache_add_speculative(head, refs)) {
|
if (!head) {
|
||||||
*nr -= refs;
|
*nr -= refs;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
13
mm/hugetlb.c
13
mm/hugetlb.c
|
@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
|
|
||||||
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
|
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
|
||||||
page = pte_page(huge_ptep_get(pte));
|
page = pte_page(huge_ptep_get(pte));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Instead of doing 'try_get_page()' below in the same_page
|
||||||
|
* loop, just check the count once here.
|
||||||
|
*/
|
||||||
|
if (unlikely(page_count(page) <= 0)) {
|
||||||
|
if (pages) {
|
||||||
|
spin_unlock(ptl);
|
||||||
|
remainder = 0;
|
||||||
|
err = -ENOMEM;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
same_page:
|
same_page:
|
||||||
if (pages) {
|
if (pages) {
|
||||||
pages[i] = mem_map_offset(page, pfn_offset);
|
pages[i] = mem_map_offset(page, pfn_offset);
|
||||||
|
|
Loading…
Reference in New Issue