/*
 * Git-format unified diff against five btrfs files: fs/btrfs/ctree.h,
 * fs/btrfs/extent-tree.c, fs/btrfs/file.c, fs/btrfs/inode.c and
 * fs/btrfs/ioctl.c.
 *
 * Summary of the hunks below:
 *
 *  - ctree.h / extent-tree.c: btrfs_start_nocow_write() and
 *    btrfs_end_nocow_write() are renamed to
 *    btrfs_start_write_no_snapshoting() and
 *    btrfs_end_write_no_snapshoting(). The start function still returns 0
 *    when root->will_be_snapshoted is non-zero and 1 otherwise; the comment
 *    above the pair is reworded to also mention writes followed by
 *    expanding truncates.
 *
 *  - file.c / inode.c: every existing caller of the old names is switched
 *    to the new names; no other change is made at those call sites.
 *
 *  - inode.c: adds wait_snapshoting_atomic_t(), which calls schedule() and
 *    returns 0, and wait_for_snapshot_creation(), which loops calling
 *    btrfs_start_write_no_snapshoting() and, while that returns 0, calls
 *    wait_on_atomic_t() on root->will_be_snapshoted with
 *    TASK_UNINTERRUPTIBLE. The loop exits only once the start call has
 *    succeeded, so the caller owns a matching
 *    btrfs_end_write_no_snapshoting().
 *
 *  - inode.c, btrfs_setsize(): in the newsize > oldsize branch,
 *    wait_for_snapshot_creation() is called before btrfs_cont_expand().
 *    btrfs_end_write_no_snapshoting() is called on both error returns
 *    (btrfs_cont_expand() failure, btrfs_start_transaction() failure) and
 *    after btrfs_update_inode() on the normal path.
 *
 *  - ioctl.c: the static helper btrfs_wait_nocow_write() is renamed to
 *    btrfs_wait_for_no_snapshoting_writes(), and create_snapshot() replaces
 *    the plain atomic_dec() of root->will_be_snapshoted with
 *    atomic_dec_and_test() followed by wake_up_atomic_t() when that test
 *    is true.
 *
 * NOTE(review): in this copy the original line breaks and tab indentation
 * of the patch appear to have been collapsed into single spaces, so hunk
 * bodies no longer sit on their own lines. Presumably the patch cannot be
 * applied as-is and must be restored from the original commit; verify
 * before use.
 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9918ba3ec2b2..fc73e86235e8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3480,8 +3480,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); -int btrfs_start_nocow_write(struct btrfs_root *root); -void btrfs_end_nocow_write(struct btrfs_root *root); +int btrfs_start_write_no_snapshoting(struct btrfs_root *root); +void btrfs_end_write_no_snapshoting(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e81e3694d92..b4e3ab115f5f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9656,12 +9656,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } /* - * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), - * they are used to prevent the some tasks writing data into the page cache - * by nocow before the subvolume is snapshoted, but flush the data into - * the disk after the snapshot creation. + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). 
*/ -void btrfs_end_nocow_write(struct btrfs_root *root) +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* @@ -9673,7 +9675,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) wake_up(&root->subv_writers->wait); } -int btrfs_start_nocow_write(struct btrfs_root *root) +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) { if (atomic_read(&root->will_be_snapshoted)) return 0; @@ -9684,7 +9686,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) */ smp_mb(); if (atomic_read(&root->will_be_snapshoted)) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); return 0; } return 1; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0fbf0e7bc606..e4090259569b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_nocow_write(root); + ret = btrfs_start_write_no_snapshoting(root); if (!ret) return -ENOSPC; @@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, btrfs_free_reserved_data_space(inode, reserve_bytes); else - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); break; } @@ -1632,7 +1632,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1661,7 +1661,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_nocow_write(root); + 
btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { btrfs_delalloc_release_space(inode, release_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a5374c2bb943..8de23355f6cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1337,7 +1337,7 @@ next_slot: * we fall into common COW way. */ if (!nolock) { - err = btrfs_start_nocow_write(root); + err = btrfs_start_write_no_snapshoting(root); if (!err) goto out_check; } @@ -1361,7 +1361,7 @@ out_check: if (extent_end <= start) { path->slots[0]++; if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto next_slot; } if (!nocow) { @@ -1381,7 +1381,7 @@ out_check: page_started, nr_written, 1); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } cow_start = (u64)-1; @@ -1432,7 +1432,7 @@ out_check: num_bytes); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } } @@ -1443,7 +1443,7 @@ out_check: EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -4599,6 +4599,26 @@ next: return err; } +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +static void wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} + static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4623,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize > oldsize) { truncate_pagecache(inode, newsize); + /* + * Don't do an expanding truncate 
while snapshoting is ongoing. + * This is to ensure the snapshot captures a fully consistent + * state of this file - if the snapshot captures this expanding + * truncation, it must capture all writes that happened before + * this truncation. + */ + wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) + if (ret) { + btrfs_end_write_no_snapshoting(root); return ret; + } trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_end_write_no_snapshoting(root); return PTR_ERR(trans); + } i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); + btrfs_end_write_no_snapshoting(root); btrfs_end_transaction(trans, root); } else { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3abc068c5543..b590e23fa03e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -617,7 +617,7 @@ fail: return ret; } -static void btrfs_wait_nocow_write(struct btrfs_root *root) +static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) { s64 writers; DEFINE_WAIT(wait); @@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); - btrfs_wait_nocow_write(root); + btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -732,7 +732,8 @@ fail: free: kfree(pending_snapshot); out: - atomic_dec(&root->will_be_snapshoted); + if (atomic_dec_and_test(&root->will_be_snapshoted)) + wake_up_atomic_t(&root->will_be_snapshoted); return ret; }