From b1bf862e9dad431175a1174379476299dbfdc017 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 28 Feb 2011 09:52:08 -0500 Subject: [PATCH 1/6] Btrfs: fix regressions in copy_from_user handling Commit 914ee295af418e936ec20a08c1663eaabe4cd07a fixed deadlocks in btrfs_file_write where we would catch page faults on pages we had locked. But, there were a few problems: 1) The x86-32 iov_iter_copy_from_user_atomic code always fails to copy data when the amount to copy is more than 4K and the offset to start copying from is not page aligned. The result was btrfs_file_write looping forever retrying the iov_iter_copy_from_user_atomic call. We deal with this by changing btrfs_file_write to drop down to single page copies when iov_iter_copy_from_user_atomic starts returning failure. 2) The btrfs_file_write code was leaking delalloc reservations when iov_iter_copy_from_user_atomic returned zero. The looping above would result in the entire filesystem running out of delalloc reservations and constantly trying to flush things to disk. 3) btrfs_file_write will lock down page cache pages, make sure any writeback is finished, do the copy_from_user and then release them. Before the loop runs we check the first and last pages in the write to see if they are only being partially modified. If the start or end of the write isn't aligned, we make sure the corresponding pages are up to date so that we don't introduce garbage into the file. With the copy_from_user changes, we're allowing the VM to reclaim the pages after a partial update from copy_from_user, but we're not making sure the page cache page is up to date when we loop around to resume the write. We deal with this by pushing the up to date checks down into the page prep code. This fits better with how the rest of file_write works. 
Signed-off-by: Chris Mason Reported-by: Mitch Harder cc: stable@kernel.org --- fs/btrfs/file.c | 101 ++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 65338a1d14ad..13664b315fe2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -761,6 +761,27 @@ out: return 0; } +/* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + /* * this gets pages into the page cache and locks them down, it also properly * waits for data=ordered extents to finish before allowing the pages to be @@ -776,6 +797,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; int err = 0; + int faili = 0; u64 start_pos; u64 last_pos; @@ -793,15 +815,24 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { - int c; - for (c = i - 1; c >= 0; c--) { - unlock_page(pages[c]); - page_cache_release(pages[c]); - } - return -ENOMEM; + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, @@ -841,6 +872,14 @@ again: WARN_ON(!PageLocked(pages[i])); } return 0; +fail: + while (faili >= 0) { 
+ unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + } static ssize_t btrfs_file_aio_write(struct kiocb *iocb, @@ -850,7 +889,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pinned[2]; struct page **pages = NULL; struct iov_iter i; loff_t *ppos = &iocb->ki_pos; @@ -871,9 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - pinned[0] = NULL; - pinned[1] = NULL; - start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); @@ -961,32 +996,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } - } - while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), @@ -1023,8 +1032,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + + /* + 
* if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) @@ -1068,10 +1089,6 @@ out: err = ret; kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); *ppos = pos; /* From 31339acd07b4ba687906702085127895a56eb920 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 7 Mar 2011 11:10:24 -0500 Subject: [PATCH 2/6] Btrfs: deal with short returns from copy_from_user When copy_from_user is only able to copy some of the bytes we requested, we may end up creating a partially up to date page. To avoid garbage in the page, we need to treat a partial copy as a zero length copy. This makes the rest of the file_write code drop the page and retry the whole copy instead of marking the partially up to date page as dirty. Signed-off-by: Chris Mason cc: stable@kernel.org --- fs/btrfs/file.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 13664b315fe2..ab22ca4f237f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -69,6 +69,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* Flush processor's dcache for this page */ flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. 
+ */ + if (!PageUptodate(page) && copied < count) + copied = 0; + iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; From ea8efc74bd0402b4d5f663d007b4e25fa29ea778 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 8 Mar 2011 11:54:40 -0500 Subject: [PATCH 3/6] Btrfs: make sure not to return overlapping extents to fiemap The btrfs fiemap code was incorrectly returning duplicate or overlapping extents in some cases. cp was blindly trusting this result and we would end up with a destination file that was bigger than the original because some bytes were copied twice. The fix here adjusts our offsets to make sure we're always moving forward in the fiemap results. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ff45b80d90f0..9fcb5ede6b72 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3046,17 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - off = extent_map_end(em); - if (off >= max) - end = 1; + u64 offset_in_extent; - em_start = em->start; - em_len = em->len; + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; + + /* + * get_extent may return an extent that starts before our + * requested range. 
We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); + em_len = em_end - em_start; emflags = em->flags; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; @@ -3067,7 +3088,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + disko = em->block_start + offset_in_extent; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; From b4966b7770349deb05e3dd2bd2c65d2d044abbbb Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Wed, 9 Mar 2011 16:46:42 +0000 Subject: [PATCH 4/6] btrfs: fix dip leak The btrfs DIO code leaks dip structs when dip->csums allocation fails; bio->bi_end_io isn't set at the point where the free_ordered branch is consequently taken, thus bio_endio doesn't call the function which would free it in the normal case. Fix. 
Signed-off-by: Daniel J Blueman Acked-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 44b926646e33..e7a8303328b2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6058,6 +6058,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, if (!skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { + kfree(dip); ret = -ENOMEM; goto free_ordered; } From 7e6b6465e6efbca3985258996be9c189da96c8bf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 18 Feb 2011 09:21:17 +0000 Subject: [PATCH 5/6] btrfs: fix not enough reserved space btrfs_link() will insert 3 items(inode ref, dir name item and dir index item) into the b+ tree and update 2 items(its inode, and parent's inode) in the b+ tree. So we should reserve space for these 5 items, not 3 items. Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e7a8303328b2..db67821ccac2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4823,10 +4823,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; /* - * 1 item for inode ref + * 2 items for inode and inode ref * 2 items for dir items + * 1 item for parent inode */ - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto fail; From 36e39c40b3facc9b489a13f1d301fc53ff6960a3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 12 Mar 2011 07:08:42 -0500 Subject: [PATCH 6/6] Btrfs: break out of shrink_delalloc earlier Josef had changed shrink_delalloc to exit after three shrink attempts, which wasn't quite enough because new writers could race in and steal free space. 
But it also fixed deadlocks and stalls as we tried to recover delalloc reservations. The code was tweaked to loop 1024 times, and would reset the counter any time a small amount of progress was made. This was too drastic, and with a lot of writers we can end up stuck in shrink_delalloc forever. The shrink_delalloc loop is fairly complex because the caller is looping too, and the caller will go ahead and force a transaction commit to make sure we reclaim space. This reworks things to exit shrink_delalloc when we've forced some writeback and the delalloc reservations have gone down. This means the writeback has not just started but has also finished at least some of the metadata changes required to reclaim delalloc space. If we've got this wrong, we're returning ENOSPC too early, which is a big improvement over the current behavior of hanging the machine. Test 224 in xfstests hammers on this nicely, and with 1000 writers trying to fill a 1GB drive we get our first ENOSPC at 93% full. The other writers are able to continue until we get 100%. This is a worst case test for btrfs because the 1000 writers are doing small IO, and the small FS size means we don't have a lot of room for metadata chunks. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 35 +++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 28188a786da0..8b4b9d158a0a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -729,6 +729,15 @@ struct btrfs_space_info { u64 disk_total; /* total bytes on disk, takes mirrors into account */ + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. 
The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 100e409e9053..f1db57d4a016 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3343,15 +3343,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int pause = 1; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + unsigned long progress; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; if (reserved == 0) return 0; @@ -3366,31 +3367,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) { - loops = 0; + if (reserved > space_info->bytes_reserved) reclaimed += reserved - space_info->bytes_reserved; - } else { - loops++; - } reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); + loops++; + if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; - __set_current_state(TASK_INTERRUPTIBLE); - time_left = schedule_timeout(pause); + time_left = schedule_timeout_interruptible(1); /* We were interrupted, exit */ if (time_left) break; - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + /* we've kicked the IO a few times, if anything has been freed, + * exit. 
There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } } return reclaimed >= to_reclaim; @@ -3613,6 +3619,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -3845,6 +3852,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -4006,7 +4014,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = 0; } spin_unlock(&BTRFS_I(inode)->accounting_lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) @@ -4134,6 +4141,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4185,6 +4193,7 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4235,6 +4244,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; 
space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -4713,6 +4723,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (ret) { spin_lock(&cache->space_info->lock); cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; spin_unlock(&cache->space_info->lock); } goto out;