fuse update for 5.8

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQSQHSd0lITzzeNWNm3h3BK/laaZPAUCXt/0GAAKCRDh3BK/laaZ
 PIJjAP48TurDqomsQMBLiOsSUy0YIhd5QC/G5MYLKSBojXoR+gD+KfqXhVIDz0En
 OI+K4674cNhf4CXNzUedU3qSOaJLfAU=
 =PqbB
 -----END PGP SIGNATURE-----

Merge tag 'fuse-update-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse

Pull fuse updates from Miklos Szeredi:

 - Fix a rare deadlock in virtiofs

 - Fix st_blocks in writeback cache mode

 - Fix wrong checks in splice move causing spurious warnings

 - Fix a race between a GETATTR request and a FUSE_NOTIFY_INVAL_INODE
   notification

 - Use rb-tree instead of linear search for pages currently under
   writeout by userspace

 - Fix copy_file_range() inconsistencies

* tag 'fuse-update-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
  fuse: copy_file_range should truncate cache
  fuse: fix copy_file_range cache issues
  fuse: optimize writepages search
  fuse: update attr_version counter on fuse_notify_inval_inode()
  fuse: don't check refcount after stealing page
  fuse: fix weird page warning
  fuse: use dump_page
  virtiofs: do not use fuse_fill_super_common() for device installation
  fuse: always allow query of st_dev
  fuse: always flush dirty data on close(2)
  fuse: invalidate inode attr in writeback cache mode
  fuse: Update stale comment in queue_interrupt()
  fuse: BUG_ON correction in fuse_dev_splice_write()
  virtiofs: Add mount option and atime behavior to the doc
  virtiofs: schedule blocking async replies in separate worker
This commit is contained in:
Linus Torvalds 2020-06-09 15:48:24 -07:00
commit 5b14671be5
7 changed files with 219 additions and 85 deletions

View File

@ -39,6 +39,20 @@ Mount file system with tag ``myfs`` on ``/mnt``:
Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU
and the virtiofsd daemon.
Mount options
-------------
virtiofs supports general VFS mount options, for example, remount,
ro, rw, context, etc. It also supports FUSE mount options.
atime behavior
^^^^^^^^^^^^^^
The atime-related mount options, for example, noatime, strictatime,
are ignored. The atime behavior for virtiofs is the same as the
underlying filesystem of the directory that has been exported
on the host.
Internals
=========
Since the virtio-fs device uses the FUSE protocol for file system requests, the

View File

@ -342,7 +342,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
list_add_tail(&req->intr_entry, &fiq->interrupts);
/*
* Pairs with smp_mb() implied by test_and_set_bit()
* from request_end().
* from fuse_request_end().
*/
smp_mb();
if (test_bit(FR_FINISHED, &req->flags)) {
@ -764,16 +764,15 @@ static int fuse_check_page(struct page *page)
{
if (page_mapcount(page) ||
page->mapping != NULL ||
page_count(page) != 1 ||
(page->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
1 << PG_reclaim))) {
pr_warn("trying to steal weird page\n");
pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
1 << PG_reclaim |
1 << PG_waiters))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@ -1977,8 +1976,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
BUG_ON(nbuf >= pipe->ring_size);
BUG_ON(tail == head);
if (WARN_ON(nbuf >= count || tail == head))
goto out_free;
ibuf = &pipe->bufs[tail & mask];
obuf = &bufs[nbuf];

View File

@ -1689,8 +1689,18 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
struct inode *inode = d_inode(path->dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
if (!fuse_allow_current_process(fc))
if (!fuse_allow_current_process(fc)) {
if (!request_mask) {
/*
* If user explicitly requested *nothing* then don't
* error out, but return st_dev only.
*/
stat->result_mask = 0;
stat->dev = inode->i_sb->s_dev;
return 0;
}
return -EACCES;
}
return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
}

View File

@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
struct fuse_writepage_args {
struct fuse_io_args ia;
struct list_head writepages_entry;
struct rb_node writepages_entry;
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
@ -366,17 +366,23 @@ struct fuse_writepage_args {
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
pgoff_t idx_from, pgoff_t idx_to)
{
struct fuse_writepage_args *wpa;
struct rb_node *n;
list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
n = fi->writepages.rb_node;
while (n) {
struct fuse_writepage_args *wpa;
pgoff_t curr_index;
wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
WARN_ON(get_fuse_inode(wpa->inode) != fi);
curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
if (idx_from < curr_index + wpa->ia.ap.num_pages &&
curr_index <= idx_to) {
if (idx_from >= curr_index + wpa->ia.ap.num_pages)
n = n->rb_right;
else if (idx_to < curr_index)
n = n->rb_left;
else
return wpa;
}
}
return NULL;
}
@ -445,9 +451,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (is_bad_inode(inode))
return -EIO;
if (fc->no_flush)
return 0;
err = write_inode_now(inode, 1);
if (err)
return err;
@ -460,6 +463,10 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
err = 0;
if (fc->no_flush)
goto inval_attr_out;
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fc, id);
@ -475,6 +482,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
fc->no_flush = 1;
err = 0;
}
inval_attr_out:
/*
* In memory i_blocks is not maintained by fuse, if writeback cache is
* enabled, i_blocks from cached attr may not be accurate.
*/
if (!err && fc->writeback_cache)
fuse_invalidate_attr(inode);
return err;
}
@ -712,6 +727,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
spin_unlock(&io->lock);
ia->ap.args.end = fuse_aio_complete_req;
ia->ap.args.may_block = io->should_dirty;
err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
if (err)
fuse_aio_complete_req(fc, &ia->ap.args, err);
@ -1570,7 +1586,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
list_del(&wpa->writepages_entry);
rb_erase(&wpa->writepages_entry, &fi->writepages);
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@ -1658,6 +1674,36 @@ __acquires(fi->lock)
}
}
/*
 * Link @wpa into the rb-tree rooted at @root.
 *
 * The tree is keyed by the page index range of each request: the first index
 * is the request's write offset shifted down by PAGE_SHIFT, and the range
 * spans num_pages entries.  Ranges strictly above a node descend right,
 * ranges strictly below descend left.  If the new range intersects a node
 * already in the tree, a warning is raised and @wpa is left unlinked.
 */
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
	pgoff_t first = wpa->ia.write.in.offset >> PAGE_SHIFT;
	pgoff_t last = first + wpa->ia.ap.num_pages - 1;
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	/* An empty request has no meaningful range to insert. */
	WARN_ON(!wpa->ia.ap.num_pages);

	while (*link) {
		struct fuse_writepage_args *node;
		pgoff_t node_first;

		parent = *link;
		node = rb_entry(parent, struct fuse_writepage_args,
				writepages_entry);
		WARN_ON(node->inode != wpa->inode);
		node_first = node->ia.write.in.offset >> PAGE_SHIFT;

		if (first >= node_first + node->ia.ap.num_pages) {
			link = &parent->rb_right;
		} else if (last < node_first) {
			link = &parent->rb_left;
		} else {
			/* Overlapping range: warn and do not insert. */
			WARN_ON(true);
			return;
		}
	}

	rb_link_node(&wpa->writepages_entry, parent, link);
	rb_insert_color(&wpa->writepages_entry, root);
}
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
int error)
{
@ -1676,7 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
wpa->next = next->next;
next->next = NULL;
next->ia.ff = fuse_file_get(wpa->ia.ff);
list_add(&next->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, next);
/*
* Skip fuse_flush_writepages() to make it easy to crop requests
@ -1811,7 +1857,7 @@ static int fuse_writepage_locked(struct page *page)
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fi->lock);
list_add(&wpa->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, wpa);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
@ -1923,10 +1969,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
WARN_ON(new_ap->num_pages != 0);
spin_lock(&fi->lock);
list_del(&new_wpa->writepages_entry);
rb_erase(&new_wpa->writepages_entry, &fi->writepages);
old_wpa = fuse_find_writeback(fi, page->index, page->index);
if (!old_wpa) {
list_add(&new_wpa->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, new_wpa);
spin_unlock(&fi->lock);
return false;
}
@ -2041,7 +2087,7 @@ static int fuse_writepages_fill(struct page *page,
wpa->inode = inode;
spin_lock(&fi->lock);
list_add(&wpa->writepages_entry, &fi->writepages);
tree_insert(&fi->writepages, wpa);
spin_unlock(&fi->lock);
data->wpa = wpa;
@ -3235,13 +3281,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
if (fc->writeback_cache) {
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
inode_unlock(inode_in);
if (err)
return err;
}
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
inode_unlock(inode_in);
if (err)
return err;
inode_lock(inode_out);
@ -3249,11 +3293,27 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
if (fc->writeback_cache) {
err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
if (err)
goto out;
}
/*
* Write out dirty pages in the destination file before sending the COPY
* request to userspace. After the request is completed, truncate off
* pages (including partial ones) from the cache that have been copied,
* since these contain stale data at that point.
*
* This should be mostly correct, but if the COPY writes to partial
* pages (at the start or end) and the parts not covered by the COPY are
* written through a memory map after calling fuse_writeback_range(),
* then these partial page modifications will be lost on truncation.
*
* It is unlikely that someone would rely on such mixed-style
* modifications. Yet this does give fewer guarantees than if the
* copying was performed with write(2).
*
* To fix this, an i_mmap_sem-style lock could be used to prevent new
* faults while the copy is ongoing.
*/
err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
if (err)
goto out;
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@ -3274,6 +3334,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
truncate_inode_pages_range(inode_out->i_mapping,
ALIGN_DOWN(pos_out, PAGE_SIZE),
ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
if (fc->writeback_cache) {
fuse_write_update_size(inode_out, pos_out + outarg.size);
file_update_time(file_out);
@ -3351,5 +3415,5 @@ void fuse_init_file_inode(struct inode *inode)
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
INIT_LIST_HEAD(&fi->writepages);
fi->writepages = RB_ROOT;
}

View File

@ -111,7 +111,7 @@ struct fuse_inode {
wait_queue_head_t page_waitq;
/* List of writepage requests (pending or sent) */
struct list_head writepages;
struct rb_root writepages;
};
/* readdir cache (directory only) */
@ -249,6 +249,7 @@ struct fuse_args {
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
bool may_block:1;
struct fuse_in_arg in_args[3];
struct fuse_arg out_args[2];
void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);

View File

@ -321,6 +321,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
loff_t offset, loff_t len)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
struct fuse_inode *fi;
struct inode *inode;
pgoff_t pg_start;
pgoff_t pg_end;
@ -329,6 +331,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
if (!inode)
return -ENOENT;
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
forget_all_cached_acls(inode);
if (offset >= 0) {
@ -1113,7 +1120,7 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
{
struct fuse_dev *fud;
struct fuse_dev *fud = NULL;
struct fuse_conn *fc = get_fuse_conn_super(sb);
struct inode *root;
struct dentry *root_dentry;
@ -1155,9 +1162,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
if (sb->s_user_ns != &init_user_ns)
sb->s_xattr = fuse_no_acl_xattr_handlers;
fud = fuse_dev_alloc_install(fc);
if (!fud)
goto err;
if (ctx->fudptr) {
err = -ENOMEM;
fud = fuse_dev_alloc_install(fc);
if (!fud)
goto err;
}
fc->dev = sb->s_dev;
fc->sb = sb;
@ -1191,7 +1201,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_lock(&fuse_mutex);
err = -EINVAL;
if (*ctx->fudptr)
if (ctx->fudptr && *ctx->fudptr)
goto err_unlock;
err = fuse_ctl_add_conn(fc);
@ -1200,7 +1210,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
*ctx->fudptr = fud;
if (ctx->fudptr)
*ctx->fudptr = fud;
mutex_unlock(&fuse_mutex);
return 0;
@ -1208,7 +1219,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
mutex_unlock(&fuse_mutex);
dput(root_dentry);
err_dev_free:
fuse_dev_free(fud);
if (fud)
fuse_dev_free(fud);
err:
return err;
}

View File

@ -60,6 +60,12 @@ struct virtio_fs_forget {
struct virtio_fs_forget_req req;
};
/*
 * Per-request work item used to complete a single request from a worker;
 * consumed and freed by virtio_fs_complete_req_work().
 */
struct virtio_fs_req_work {
/* Request handed to virtio_fs_request_complete() */
struct fuse_req *req;
/* Queue handed to virtio_fs_request_complete() together with @req */
struct virtio_fs_vq *fsvq;
/* Embedded work; the handler recovers this struct via container_of() */
struct work_struct done_work;
};
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
struct fuse_req *req, bool in_flight);
@ -485,19 +491,67 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
}
/* Work function for request completion */
static void virtio_fs_request_complete(struct fuse_req *req,
struct virtio_fs_vq *fsvq)
{
struct fuse_pqueue *fpq = &fsvq->fud->pq;
struct fuse_conn *fc = fsvq->fud->fc;
struct fuse_args *args;
struct fuse_args_pages *ap;
unsigned int len, i, thislen;
struct page *page;
/*
* TODO verify that server properly follows FUSE protocol
* (oh.uniq, oh.len)
*/
args = req->args;
copy_args_from_argbuf(args, req);
if (args->out_pages && args->page_zeroing) {
len = args->out_args[args->out_numargs - 1].size;
ap = container_of(args, typeof(*ap), args);
for (i = 0; i < ap->num_pages; i++) {
thislen = ap->descs[i].length;
if (len < thislen) {
WARN_ON(ap->descs[i].offset);
page = ap->pages[i];
zero_user_segment(page, len, thislen);
len = 0;
} else {
len -= thislen;
}
}
}
spin_lock(&fpq->lock);
clear_bit(FR_SENT, &req->flags);
spin_unlock(&fpq->lock);
fuse_request_end(fc, req);
spin_lock(&fsvq->lock);
dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
}
static void virtio_fs_complete_req_work(struct work_struct *work)
{
struct virtio_fs_req_work *w =
container_of(work, typeof(*w), done_work);
virtio_fs_request_complete(w->req, w->fsvq);
kfree(w);
}
static void virtio_fs_requests_done_work(struct work_struct *work)
{
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
done_work);
struct fuse_pqueue *fpq = &fsvq->fud->pq;
struct fuse_conn *fc = fsvq->fud->fc;
struct virtqueue *vq = fsvq->vq;
struct fuse_req *req;
struct fuse_args_pages *ap;
struct fuse_req *next;
struct fuse_args *args;
unsigned int len, i, thislen;
struct page *page;
unsigned int len;
LIST_HEAD(reqs);
/* Collect completed requests off the virtqueue */
@ -515,38 +569,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
/* End requests */
list_for_each_entry_safe(req, next, &reqs, list) {
/*
* TODO verify that server properly follows FUSE protocol
* (oh.uniq, oh.len)
*/
args = req->args;
copy_args_from_argbuf(args, req);
if (args->out_pages && args->page_zeroing) {
len = args->out_args[args->out_numargs - 1].size;
ap = container_of(args, typeof(*ap), args);
for (i = 0; i < ap->num_pages; i++) {
thislen = ap->descs[i].length;
if (len < thislen) {
WARN_ON(ap->descs[i].offset);
page = ap->pages[i];
zero_user_segment(page, len, thislen);
len = 0;
} else {
len -= thislen;
}
}
}
spin_lock(&fpq->lock);
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
spin_unlock(&fpq->lock);
fuse_request_end(fc, req);
spin_lock(&fsvq->lock);
dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
/* blocking async request completes in a worker context */
if (req->args->may_block) {
struct virtio_fs_req_work *w;
w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
w->fsvq = fsvq;
w->req = req;
schedule_work(&w->done_work);
} else {
virtio_fs_request_complete(req, fsvq);
}
}
}
@ -1067,7 +1103,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
err = -ENOMEM;
/* Allocate fuse_dev for hiprio and notification queues */
for (i = 0; i < VQ_REQUEST; i++) {
for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
fsvq->fud = fuse_dev_alloc();
@ -1075,18 +1111,15 @@ static int virtio_fs_fill_super(struct super_block *sb)
goto err_free_fuse_devs;
}
ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
/* virtiofs allocates and installs its own fuse devices */
ctx.fudptr = NULL;
err = fuse_fill_super_common(sb, &ctx);
if (err < 0)
goto err_free_fuse_devs;
fc = fs->vqs[VQ_REQUEST].fud->fc;
for (i = 0; i < fs->nvqs; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
if (i == VQ_REQUEST)
continue; /* already initialized */
fuse_dev_install(fsvq->fud, fc);
}