btrfs: track DIO bytes in flight

When diagnosing a slowdown of generic/224 I noticed we were not doing
anything when calling into shrink_delalloc().  This is because all
writes in 224 are O_DIRECT, not delalloc, and thus our delalloc_bytes
counter is 0, which short circuits most of the work inside of
shrink_delalloc().  However O_DIRECT writes still consume metadata
resources and generate ordered extents, which we can still wait on.

Fix this by tracking outstanding DIO write bytes, and use this as well
as the delalloc bytes counter to decide if we need to look up and wait on
any ordered extents.  If we have more DIO writes than delalloc bytes
we'll go ahead and wait on any ordered extents regardless of our flush
state as flushing delalloc is likely to not gain us anything.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ use dio instead of odirect in identifiers ]
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Josef Bacik 2019-04-10 15:56:09 -04:00 committed by David Sterba
parent da9b6ec829
commit 4297ff84dc
4 changed files with 36 additions and 4 deletions

View File

@ -1016,6 +1016,7 @@ struct btrfs_fs_info {
/* used to keep from writing metadata until there is a nice batch */ /* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes; struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes; struct percpu_counter delalloc_bytes;
struct percpu_counter dio_bytes;
s32 dirty_metadata_batch; s32 dirty_metadata_batch;
s32 delalloc_batch; s32 delalloc_batch;

View File

@ -2633,11 +2633,17 @@ int open_ctree(struct super_block *sb,
goto fail; goto fail;
} }
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
if (ret) { if (ret) {
err = ret; err = ret;
goto fail_srcu; goto fail_srcu;
} }
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
goto fail_dio_bytes;
}
fs_info->dirty_metadata_batch = PAGE_SIZE * fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids)); (1 + ilog2(nr_cpu_ids));
@ -3336,6 +3342,8 @@ fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes: fail_dirty_metadata_bytes:
percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
fail_dio_bytes:
percpu_counter_destroy(&fs_info->dio_bytes);
fail_srcu: fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu); cleanup_srcu_struct(&fs_info->subvol_srcu);
fail: fail:
@ -4017,6 +4025,10 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_sum(&fs_info->delalloc_bytes)); percpu_counter_sum(&fs_info->delalloc_bytes));
} }
if (percpu_counter_sum(&fs_info->dio_bytes))
btrfs_info(fs_info, "at unmount dio bytes count %lld",
percpu_counter_sum(&fs_info->dio_bytes));
btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@ -4048,6 +4060,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->dio_bytes);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu); cleanup_srcu_struct(&fs_info->subvol_srcu);

View File

@ -4633,6 +4633,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
struct btrfs_space_info *space_info; struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans; struct btrfs_trans_handle *trans;
u64 delalloc_bytes; u64 delalloc_bytes;
u64 dio_bytes;
u64 async_pages; u64 async_pages;
u64 items; u64 items;
long time_left; long time_left;
@ -4648,7 +4649,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
delalloc_bytes = percpu_counter_sum_positive( delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes); &fs_info->delalloc_bytes);
if (delalloc_bytes == 0) { dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
if (delalloc_bytes == 0 && dio_bytes == 0) {
if (trans) if (trans)
return; return;
if (wait_ordered) if (wait_ordered)
@ -4656,8 +4658,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
return; return;
} }
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
if (dio_bytes > delalloc_bytes)
wait_ordered = true;
loops = 0; loops = 0;
while (delalloc_bytes && loops < 3) { while ((delalloc_bytes || dio_bytes) && loops < 3) {
nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
/* /*
@ -4707,6 +4717,7 @@ skip_async:
} }
delalloc_bytes = percpu_counter_sum_positive( delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes); &fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
} }
} }

View File

@ -195,8 +195,11 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags); set_bit(type, &entry->flags);
if (dio) if (dio) {
percpu_counter_add_batch(&fs_info->dio_bytes, len,
fs_info->delalloc_batch);
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
}
/* one ref for the tree */ /* one ref for the tree */
refcount_set(&entry->refs, 1); refcount_set(&entry->refs, 1);
@ -468,6 +471,10 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (root != fs_info->tree_root) if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree; tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock); spin_lock_irq(&tree->lock);
node = &entry->rb_node; node = &entry->rb_node;