From bb737bbe48bea9854455cb61ea1dc06e92ce586c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 20 Apr 2020 17:01:34 +0200 Subject: [PATCH 01/15] virtiofs: schedule blocking async replies in separate worker In virtiofs (unlike in regular fuse) processing of async replies is serialized. This can result in a deadlock in rare corner cases when there's a circular dependency between the completion of two or more async replies. Such a deadlock can be reproduced with xfstests:generic/503 if TEST_DIR == SCRATCH_MNT (which is a misconfiguration): - Process A is waiting for page lock in worker thread context and blocked (virtio_fs_requests_done_work()). - Process B is holding page lock and waiting for pending writes to finish (fuse_wait_on_page_writeback()). - Write requests are waiting in virtqueue and can't complete because worker thread is blocked on page lock (process A). Fix this by creating a unique work_struct for each async reply that can block (O_DIRECT read). Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 + fs/fuse/fuse_i.h | 1 + fs/fuse/virtio_fs.c | 106 +++++++++++++++++++++++++++++--------------- 3 files changed, 73 insertions(+), 35 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..d400b71b98d5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -712,6 +712,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, spin_unlock(&io->lock); ia->ap.args.end = fuse_aio_complete_req; + ia->ap.args.may_block = io->should_dirty; err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); if (err) fuse_aio_complete_req(fc, &ia->ap.args, err); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ca344bf71404..d7cde216fc87 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -249,6 +249,7 @@ struct fuse_args { bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; + bool may_block:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bade74768903..0c6ef5d3c6ab 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -60,6 +60,12 @@ struct virtio_fs_forget { struct virtio_fs_forget_req req; }; +struct virtio_fs_req_work { + struct fuse_req *req; + struct virtio_fs_vq *fsvq; + struct work_struct done_work; +}; + static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); @@ -485,19 +491,67 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) } /* Work function for request completion */ +static void virtio_fs_request_complete(struct fuse_req *req, + struct virtio_fs_vq *fsvq) +{ + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_conn *fc = fsvq->fud->fc; + struct fuse_args *args; + struct fuse_args_pages *ap; + unsigned int len, i, thislen; + struct page *page; + + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); + + fuse_request_end(fc, req); + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_complete_req_work(struct work_struct *work) +{ + struct virtio_fs_req_work *w = + container_of(work, typeof(*w), done_work); + + virtio_fs_request_complete(w->req, w->fsvq); + kfree(w); +} + static void virtio_fs_requests_done_work(struct work_struct *work) { struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, done_work); struct fuse_pqueue *fpq = &fsvq->fud->pq; - struct fuse_conn *fc = fsvq->fud->fc; struct virtqueue *vq = fsvq->vq; struct fuse_req *req; - struct fuse_args_pages *ap; struct fuse_req *next; - struct fuse_args *args; - unsigned int len, i, thislen; - struct page *page; + unsigned int len; LIST_HEAD(reqs); /* Collect completed requests off the virtqueue */ @@ -515,38 +569,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work) /* End requests */ list_for_each_entry_safe(req, next, &reqs, list) { - /* - * TODO verify that server properly follows FUSE protocol - * (oh.uniq, oh.len) - */ - args = req->args; - copy_args_from_argbuf(args, req); - - if (args->out_pages && args->page_zeroing) { - len = args->out_args[args->out_numargs - 1].size; - ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { - thislen = ap->descs[i].length; - if (len < thislen) { - WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); - len = 0; - } else { - len -= thislen; - } - } - } - - spin_lock(&fpq->lock); - clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - spin_unlock(&fpq->lock); - fuse_request_end(fc, req); - spin_lock(&fsvq->lock); - dec_in_flight_req(fsvq); - spin_unlock(&fsvq->lock); + /* blocking async request completes in a worker context */ + if (req->args->may_block) { + struct virtio_fs_req_work *w; + + w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL); + INIT_WORK(&w->done_work, virtio_fs_complete_req_work); + w->fsvq = fsvq; + w->req = req; + schedule_work(&w->done_work); + } else { + virtio_fs_request_complete(req, fsvq); + } } } From a5d8422cc9598b11054faf34772d9ce68ae204fd Mon Sep 17 00:00:00 2001 From: Masayoshi Mizuma Date: Tue, 17 Dec 2019 15:55:20 -0500 Subject: [PATCH 02/15] virtiofs: Add mount option and atime behavior to the doc Add a section to show the mount option and a subsection to show the atime behavior. Signed-off-by: Masayoshi Mizuma Signed-off-by: Miklos Szeredi --- Documentation/filesystems/virtiofs.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/filesystems/virtiofs.rst b/Documentation/filesystems/virtiofs.rst index e06e4951cb39..fd4d2484e949 100644 --- a/Documentation/filesystems/virtiofs.rst +++ b/Documentation/filesystems/virtiofs.rst @@ -39,6 +39,20 @@ Mount file system with tag ``myfs`` on ``/mnt``: Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU and the virtiofsd daemon. +Mount options +------------- + +virtiofs supports general VFS mount options, for example, remount, +ro, rw, context, etc. It also supports FUSE mount options. + +atime behavior +^^^^^^^^^^^^^^ + +The atime-related mount options, for example, noatime, strictatime, +are ignored. The atime behavior for virtiofs is the same as the +underlying filesystem of the directory that has been exported +on the host. + Internals ========= Since the virtio-fs device uses the FUSE protocol for file system requests, the From 0e9fb6f17ad5b386b75451328975a07d7d953c6d Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 19 Aug 2019 09:53:50 +0300 Subject: [PATCH 03/15] fuse: BUG_ON correction in fuse_dev_splice_write() commit 963545357202 ("fuse: reduce allocation size for splice_write") changed size of bufs array, so BUG_ON which checks the index of the array shold also be fixed. [SzM: turn BUG_ON into WARN_ON] Fixes: 963545357202 ("fuse: reduce allocation size for splice_write") Signed-off-by: Vasily Averin Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..5c155437a455 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1977,8 +1977,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - BUG_ON(nbuf >= pipe->ring_size); - BUG_ON(tail == head); + if (WARN_ON(nbuf >= count || tail == head)) + goto out_free; + ibuf = &pipe->bufs[tail & mask]; obuf = &bufs[nbuf]; From 75d892588e959573b321895003462d88cae2cff3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 28 Feb 2020 15:15:24 +0300 Subject: [PATCH 04/15] fuse: Update stale comment in queue_interrupt() Fixes: 04ec5af0776e "fuse: export fuse_end_request()" Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5c155437a455..6bda7d498f1a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -342,7 +342,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) list_add_tail(&req->intr_entry, &fiq->interrupts); /* * Pairs with smp_mb() implied by test_and_set_bit() - * from request_end(). + * from fuse_request_end(). */ smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { From cf576c58b3a283333fc6e9a7c1c8e5342fa59b97 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 12 May 2020 10:29:04 +0800 Subject: [PATCH 05/15] fuse: invalidate inode attr in writeback cache mode Under writeback mode, inode->i_blocks is not updated, making utils du read st.blocks as 0. For example, when using virtiofs (cache=always & nondax mode) with writeback_cache enabled, writing a new file and check its disk usage with du, du reports 0 usage. # uname -r 5.6.0-rc6+ # mount -t virtiofs virtiofs /mnt/virtiofs # rm -f /mnt/virtiofs/testfile # create new file and do extend write # xfs_io -fc "pwrite 0 4k" /mnt/virtiofs/testfile wrote 4096/4096 bytes at offset 0 4 KiB, 1 ops; 0.0001 sec (28.103 MiB/sec and 7194.2446 ops/sec) # du -k /mnt/virtiofs/testfile 0 <==== disk usage is 0 # stat -c %s,%b /mnt/virtiofs/testfile 4096,0 <==== i_size is correct, but st_blocks is 0 Fix it by invalidating attr in fuse_flush(), so we get up-to-date attr from server on next getattr. Signed-off-by: Eryu Guan Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d400b71b98d5..262c5e20f324 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -445,8 +445,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (is_bad_inode(inode)) return -EIO; + err = 0; if (fc->no_flush) - return 0; + goto inval_attr_out; err = write_inode_now(inode, 1); if (err) @@ -475,6 +476,14 @@ static int fuse_flush(struct file *file, fl_owner_t id) fc->no_flush = 1; err = 0; } + +inval_attr_out: + /* + * In memory i_blocks is not maintained by fuse, if writeback cache is + * enabled, i_blocks from cached attr may not be accurate. + */ + if (!err && fc->writeback_cache) + fuse_invalidate_attr(inode); return err; } From 614c026e8a46636198da93ec30719f93975bb26a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:37 +0200 Subject: [PATCH 06/15] fuse: always flush dirty data on close(2) We want cached data to synced with the userspace filesystem on close(), for example to allow getting correct st_blocks value. Do this regardless of whether the userspace filesystem implements a FLUSH method or not. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 262c5e20f324..4aa750d08d62 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -445,10 +445,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (is_bad_inode(inode)) return -EIO; - err = 0; - if (fc->no_flush) - goto inval_attr_out; - err = write_inode_now(inode, 1); if (err) return err; @@ -461,6 +457,10 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; + err = 0; + if (fc->no_flush) + goto inval_attr_out; + memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); From 5157da2ca42cbeca01709e29ce7b797cffed2432 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:37 +0200 Subject: [PATCH 07/15] fuse: always allow query of st_dev Fuse mounts without "allow_other" are off-limits to all non-owners. Yet it makes sense to allow querying st_dev on the root, since this value is provided by the kernel, not the userspace filesystem. Allow statx(2) with a zero request mask to succeed on a fuse mounts for all users. Reported-by: Nikolaus Rath Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index de1e2fde60bd..26f028bc760b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1689,8 +1689,18 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(fc)) { + if (!request_mask) { + /* + * If user explicitly requested *nothing* then don't + * error out, but return st_dev only. + */ + stat->result_mask = 0; + stat->dev = inode->i_sb->s_dev; + return 0; + } return -EACCES; + } return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } From 7fd3abfa8dd7c08ecacd25b2f9f9e1d3fb642440 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 4 May 2020 14:33:15 -0400 Subject: [PATCH 08/15] virtiofs: do not use fuse_fill_super_common() for device installation fuse_fill_super_common() allocates and installs one fuse_device. Hence virtiofs allocates and install all fuse devices by itself except one. This makes logic little twisted. There does not seem to be any real need that why virtiofs can't allocate and install all fuse devices itself. So opt out of fuse device allocation and installation while calling fuse_fill_super_common(). Regular fuse still wants fuse_fill_super_common() to install fuse_device. It needs to prevent against races where two mounters are trying to mount fuse using same fd. In that case one will succeed while other will get -EINVAL. virtiofs does not have this issue because sget_fc() resolves the race w.r.t multiple mounters and only one instance of virtio_fs_fill_super() should be in progress for same filesystem. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 19 ++++++++++++------- fs/fuse/virtio_fs.c | 9 +++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 95d712d44ca1..6fae66cc096a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1113,7 +1113,7 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { - struct fuse_dev *fud; + struct fuse_dev *fud = NULL; struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *root; struct dentry *root_dentry; @@ -1155,9 +1155,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fud = fuse_dev_alloc_install(fc); - if (!fud) - goto err; + if (ctx->fudptr) { + err = -ENOMEM; + fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err; + } fc->dev = sb->s_dev; fc->sb = sb; @@ -1191,7 +1194,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (*ctx->fudptr) + if (ctx->fudptr && *ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1200,7 +1203,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - *ctx->fudptr = fud; + if (ctx->fudptr) + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); return 0; @@ -1208,7 +1212,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_unlock(&fuse_mutex); dput(root_dentry); err_dev_free: - fuse_dev_free(fud); + if (fud) + fuse_dev_free(fud); err: return err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 0c6ef5d3c6ab..4c4ef5d69298 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1103,7 +1103,7 @@ static int virtio_fs_fill_super(struct super_block *sb) err = -ENOMEM; /* Allocate fuse_dev for hiprio and notification queues */ - for (i = 0; i < VQ_REQUEST; i++) { + for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; fsvq->fud = fuse_dev_alloc(); @@ -1111,18 +1111,15 @@ static int virtio_fs_fill_super(struct super_block *sb) goto err_free_fuse_devs; } - ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; + /* virtiofs allocates and installs its own fuse devices */ + ctx.fudptr = NULL; err = fuse_fill_super_common(sb, &ctx); if (err < 0) goto err_free_fuse_devs; - fc = fs->vqs[VQ_REQUEST].fud->fc; - for (i = 0; i < fs->nvqs; i++) { struct virtio_fs_vq *fsvq = &fs->vqs[i]; - if (i == VQ_REQUEST) - continue; /* already initialized */ fuse_dev_install(fsvq->fud, fc); } From 00589386172ac577e64e3d67a3c5d4968174dcad Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:37 +0200 Subject: [PATCH 09/15] fuse: use dump_page Instead of custom page dumping, use the standard helper. Reported-by: Matthew Wilcox Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6bda7d498f1a..1cfc68f8fea9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -772,8 +772,7 @@ static int fuse_check_page(struct page *page) 1 << PG_lru | 1 << PG_active | 1 << PG_reclaim))) { - pr_warn("trying to steal weird page\n"); - pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + dump_page(page, "fuse: trying to steal weird page"); return 1; } return 0; From a5005c3cda6eeb6b95645e6cc32f58dafeffc976 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:37 +0200 Subject: [PATCH 10/15] fuse: fix weird page warning When PageWaiters was added, updating this check was missed. Reported-by: Nikolaus Rath Reported-by: Hugh Dickins Fixes: 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit") Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1cfc68f8fea9..b8962581c699 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -771,7 +771,8 @@ static int fuse_check_page(struct page *page) 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | - 1 << PG_reclaim))) { + 1 << PG_reclaim | + 1 << PG_waiters))) { dump_page(page, "fuse: trying to steal weird page"); return 1; } From 32f98877c57bee6bc27f443a96f49678a2cd6a50 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:37 +0200 Subject: [PATCH 11/15] fuse: don't check refcount after stealing page page_count() is unstable. Unless there has been an RCU grace period between when the page was removed from the page cache and now, a speculative reference may exist from the page cache. Reported-by: Matthew Wilcox Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b8962581c699..ec97cabe51ce 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -764,7 +764,6 @@ static int fuse_check_page(struct page *page) { if (page_mapcount(page) || page->mapping != NULL || - page_count(page) != 1 || (page->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | From 5ddd9ced9aef6cfa76af27d384c17c9e2d610ce8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:38 +0200 Subject: [PATCH 12/15] fuse: update attr_version counter on fuse_notify_inval_inode() A GETATTR request can race with FUSE_NOTIFY_INVAL_INODE, resulting in the attribute cache being updated with stale information after the invalidation. Fix this by bumping the attribute version in fuse_reverse_inval_inode(). Reported-by: Krzysztof Rusek Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6fae66cc096a..5b4aebf5821f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -321,6 +321,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, loff_t offset, loff_t len) { + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; @@ -329,6 +331,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, if (!inode) return -ENOENT; + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); + fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { From 6b2fb79963fbed7db3ef850926d913518fd5c62f Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 19 Sep 2019 17:11:20 +0300 Subject: [PATCH 13/15] fuse: optimize writepages search Re-work fi->writepages, replacing list with rb-tree. This improves performance because kernel fuse iterates through fi->writepages for each writeback page and typical number of entries is about 800 (for 100MB of fuse writeback). Before patch: 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 29.86% [kernel] [k] _raw_spin_lock 26.62% [fuse] [k] fuse_page_is_writeback After patch: 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 23.55% [kernel] [k] copy_user_enhanced_fast_string 9.87% [kernel] [k] __memcpy 3.10% [kernel] [k] _raw_spin_lock Signed-off-by: Maxim Patlasov Signed-off-by: Vasily Averin Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 62 ++++++++++++++++++++++++++++++++++++++---------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4aa750d08d62..15812a8b3834 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct list_head writepages_entry; + struct rb_node writepages_entry; struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; @@ -366,17 +366,23 @@ struct fuse_writepage_args { static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_writepage_args *wpa; + struct rb_node *n; - list_for_each_entry(wpa, &fi->writepages, writepages_entry) { + n = fi->writepages.rb_node; + + while (n) { + struct fuse_writepage_args *wpa; pgoff_t curr_index; + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + wpa->ia.ap.num_pages && - curr_index <= idx_to) { + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else return wpa; - } } return NULL; } @@ -1624,7 +1630,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&wpa->writepages_entry); + rb_erase(&wpa->writepages_entry, &fi->writepages); for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); @@ -1712,6 +1718,36 @@ __acquires(fi->lock) } } +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return (void) WARN_ON(true); + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); +} + static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, int error) { @@ -1730,7 +1766,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, wpa->next = next->next; next->next = NULL; next->ia.ff = fuse_file_get(wpa->ia.ff); - list_add(&next->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, next); /* * Skip fuse_flush_writepages() to make it easy to crop requests @@ -1865,7 +1901,7 @@ static int fuse_writepage_locked(struct page *page) inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1977,10 +2013,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_wpa->writepages_entry); + rb_erase(&new_wpa->writepages_entry, &fi->writepages); old_wpa = fuse_find_writeback(fi, page->index, page->index); if (!old_wpa) { - list_add(&new_wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, new_wpa); spin_unlock(&fi->lock); return false; } @@ -2095,7 +2131,7 @@ static int fuse_writepages_fill(struct page *page, wpa->inode = inode; spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); spin_unlock(&fi->lock); data->wpa = wpa; @@ -3405,5 +3441,5 @@ void fuse_init_file_inode(struct inode *inode) INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); - INIT_LIST_HEAD(&fi->writepages); + fi->writepages = RB_ROOT; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d7cde216fc87..740a8a7d7ae6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -111,7 +111,7 @@ struct fuse_inode { wait_queue_head_t page_waitq; /* List of writepage requestst (pending or sent) */ - struct list_head writepages; + struct rb_root writepages; }; /* readdir cache (directory only) */ From 2c4656dfd994538176db30ce09c02cc0dfc361ae Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 20 May 2020 11:39:35 +0200 Subject: [PATCH 14/15] fuse: fix copy_file_range cache issues a) Dirty cache needs to be written back not just in the writeback_cache case, since the dirty pages may come from memory maps. b) The fuse_writeback_range() helper takes an inclusive interval, so the end position needs to be pos+len-1 instead of pos+len. Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 15812a8b3834..d365008fda44 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3325,13 +3325,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; - if (fc->writeback_cache) { - inode_lock(inode_in); - err = fuse_writeback_range(inode_in, pos_in, pos_in + len); - inode_unlock(inode_in); - if (err) - return err; - } + inode_lock(inode_in); + err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); + inode_unlock(inode_in); + if (err) + return err; inode_lock(inode_out); @@ -3339,11 +3337,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; - if (fc->writeback_cache) { - err = fuse_writeback_range(inode_out, pos_out, pos_out + len); - if (err) - goto out; - } + err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); + if (err) + goto out; if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); From 9b46418c40fe910e6537618f9932a8be78a3dd6c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 20 May 2020 11:39:35 +0200 Subject: [PATCH 15/15] fuse: copy_file_range should truncate cache After the copy operation completes the cache is not up-to-date. Truncate all pages in the interval that has successfully been copied. Truncating completely copied dirty pages is okay, since the data has been overwritten anyway. Truncating partially copied dirty pages is not okay; add a comment for now. Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d365008fda44..336d1cf72da0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3337,6 +3337,24 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; + /* + * Write out dirty pages in the destination file before sending the COPY + * request to userspace. After the request is completed, truncate off + * pages (including partial ones) from the cache that have been copied, + * since these contain stale data at that point. + * + * This should be mostly correct, but if the COPY writes to partial + * pages (at the start or end) and the parts not covered by the COPY are + * written through a memory map after calling fuse_writeback_range(), + * then these partial page modifications will be lost on truncation. + * + * It is unlikely that someone would rely on such mixed style + * modifications. Yet this does give less guarantees than if the + * copying was performed with write(2). + * + * To fix this a i_mmap_sem style lock could be used to prevent new + * faults while the copy is ongoing. + */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); if (err) goto out; @@ -3360,6 +3378,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; + truncate_inode_pages_range(inode_out->i_mapping, + ALIGN_DOWN(pos_out, PAGE_SIZE), + ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + if (fc->writeback_cache) { fuse_write_update_size(inode_out, pos_out + outarg.size); file_update_time(file_out);