diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ced5fd85dc36..08d9f8d15538 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -720,6 +720,7 @@ struct btrfs_fs_info { struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 3e7eeaf86408..759fa247ced8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,7 +93,8 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, * ref if it was able to find one, or NULL if nothing was in that spot */ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, - u64 bytenr, u64 parent) + u64 bytenr, u64 parent, + struct btrfs_delayed_ref_node **last) { struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; @@ -102,6 +103,8 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); + if (last) + *last = entry; cmp = comp_entry(entry, bytenr, parent); if (cmp < 0) @@ -114,45 +117,99 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, return NULL; } -/* - * Locking on delayed refs is done by taking a lock on the head node, - * which has the (impossible) parent id of (u64)-1. Once a lock is held - * on the head node, you're allowed (and required) to process all the - * delayed refs for a given byte number in the tree. - * - * This will walk forward in the rbtree until it finds a head node it - * is able to lock. It might not lock the delayed ref you asked for, - * and so it will return the one it did lock in next_ret and return 0. - * - * If no locks are taken, next_ret is set to null and 1 is returned. This - * means there are no more unlocked head nodes in the rbtree. - */ -int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head **next_ret) +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_head *head; - int ret = 0; + struct btrfs_delayed_ref_root *delayed_refs; - while (1) { + delayed_refs = &trans->transaction->delayed_refs; + assert_spin_locked(&delayed_refs->lock); + if (mutex_trylock(&head->mutex)) + return 0; + + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + mutex_lock(&head->mutex); + spin_lock(&delayed_refs->lock); + if (!head->node.in_tree) { + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + btrfs_put_delayed_ref(&head->node); + return 0; +} + +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 start) +{ + int count = 0; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; + + delayed_refs = &trans->transaction->delayed_refs; + if (start == 0) { + node = rb_first(&delayed_refs->root); + } else { + ref = NULL; + tree_search(&delayed_refs->root, start, (u64)-1, &ref); + if (ref) { + struct btrfs_delayed_ref_node *tmp; + + node = rb_prev(&ref->rb_node); + while (node) { + tmp = rb_entry(node, + struct btrfs_delayed_ref_node, + rb_node); + if (tmp->bytenr < start) + break; + ref = tmp; + node = rb_prev(&ref->rb_node); + } + node = &ref->rb_node; + } else + node = rb_first(&delayed_refs->root); + } +again: + while (node && count < 32) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); if (btrfs_delayed_ref_is_head(ref)) { head = btrfs_delayed_node_to_head(ref); - if (mutex_trylock(&head->mutex)) { - *next_ret = head; - ret = 0; + if (list_empty(&head->cluster)) { + list_add_tail(&head->cluster, cluster); + delayed_refs->run_delayed_start = + head->node.bytenr; + count++; + + WARN_ON(delayed_refs->num_heads_ready == 0); + delayed_refs->num_heads_ready--; + } else if (count) { + /* the goal of the clustering is to find extents + * that are likely to end up in the same extent + * leaf on disk. So, we don't want them spread + * all over the tree. Stop now if we've hit + * a head that was already in use + */ break; } } - node = rb_next(&ref->rb_node); - if (!node) { - ret = 1; - *next_ret = NULL; - break; - } - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); } - return ret; + if (count) { + return 0; + } else if (start) { + /* + * we've gone to the end of the rbtree without finding any + * clusters. start from the beginning and try again + */ + start = 0; + node = rb_first(&delayed_refs->root); + goto again; + } + return 1; } /* @@ -178,7 +235,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); if (ref) { prev_node = rb_prev(&ref->rb_node); if (!prev_node) @@ -240,7 +297,7 @@ again: } spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); if (ref) { head = btrfs_delayed_node_to_head(ref); if (mutex_trylock(&head->mutex)) { @@ -384,7 +441,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_ref *full_ref; - struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -428,6 +485,7 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, if (btrfs_delayed_ref_is_head(ref)) { head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; + INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); } else { full_ref = btrfs_delayed_node_to_ref(ref); @@ -453,6 +511,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, */ kfree(ref); } else { + if (btrfs_delayed_ref_is_head(ref)) { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + } delayed_refs->num_entries++; trans->delayed_ref_updates++; } @@ -522,7 +584,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1); + ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c345fee9f96b..57153fcc347b 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,6 +68,8 @@ struct btrfs_delayed_ref_head { */ struct mutex mutex; + struct list_head cluster; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -115,12 +117,20 @@ struct btrfs_delayed_ref_root { */ unsigned long num_entries; + /* total number of head nodes in tree */ + unsigned long num_heads; + + /* total number of head nodes ready for processing */ + unsigned long num_heads_ready; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need * to be run right away */ int flushing; + + u64 run_delayed_start; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -140,9 +150,6 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head **next_ret); int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u32 *refs); @@ -151,6 +158,10 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 parent, u64 orig_ref_root, u64 ref_root, u64 orig_ref_generation, u64 ref_generation, u64 owner_objectid, int pin); +int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head); +int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, + struct list_head *cluster, u64 search_start); /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4f43e227a297..1f1d89b18818 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1458,7 +1458,6 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; - struct btrfs_fs_info *info = root->fs_info; unsigned long now; unsigned long delay; int ret; @@ -1481,24 +1480,8 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (now < cur->start_time || now - cur->start_time < 30) { - unsigned long num_delayed; - num_delayed = cur->delayed_refs.num_entries; mutex_unlock(&root->fs_info->trans_mutex); delay = HZ * 5; - - /* - * we may have been woken up early to start - * processing the delayed extent ref updates - * If so, run some of them and then loop around again - * to see if we need to force a commit - */ - if (num_delayed > 64) { - mutex_unlock(&info->transaction_kthread_mutex); - trans = btrfs_start_transaction(root, 1); - btrfs_run_delayed_refs(trans, root, 256); - btrfs_end_transaction(trans, root); - continue; - } goto sleep; } mutex_unlock(&root->fs_info->trans_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8471c79b0877..3b8b6c212701 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -926,52 +926,41 @@ again: return NULL; } -/* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. - */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count) +static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct list_head *cluster) { - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; + int count = 0; int must_insert_reserved = 0; - int run_all = count == (unsigned long)-1; - - if (root == root->fs_info->extent_root) - root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; -again: - spin_lock(&delayed_refs->lock); - if (count == 0) - count = delayed_refs->num_entries; while (1) { if (!locked_ref) { - /* - * no locked ref, go find something we can - * process in the rbtree. We start at - * the beginning of the tree, there may be less - * lock contention if we do something smarter here. - */ - node = rb_first(&delayed_refs->root); - if (!node) { - spin_unlock(&delayed_refs->lock); + /* pick a new head ref from the cluster list */ + if (list_empty(cluster)) break; - } - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref); - if (ret) { - spin_unlock(&delayed_refs->lock); - break; + locked_ref = list_entry(cluster->next, + struct btrfs_delayed_ref_head, cluster); + + /* grab the lock that says we are going to process + * all the refs for this head */ + ret = btrfs_delayed_ref_lock(trans, locked_ref); + + /* + * we may have dropped the spin lock to get the head + * mutex lock, and that might have given someone else + * time to free the head. If that's true, it has been + * removed from our list and we can move on. + */ + if (ret == -EAGAIN) { + locked_ref = NULL; + count++; + continue; } } @@ -986,7 +975,6 @@ again: * locked_ref is the head node, so we have to go one * node back for any delayed ref updates */ - ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead @@ -994,6 +982,7 @@ again: * so that any accounting fixes can happen */ ref = &locked_ref->node; + list_del_init(&locked_ref->cluster); locked_ref = NULL; } @@ -1007,30 +996,72 @@ again: BUG_ON(ret); btrfs_put_delayed_ref(ref); - /* once we lock the head ref, we have to process all the - * entries for it. So, we might end up doing more entries - * that count was asking us to do. - */ - if (count > 0) - count--; - - /* - * we set locked_ref to null above if we're all done - * with this bytenr - */ - if (!locked_ref && count == 0) - break; - + count++; cond_resched(); spin_lock(&delayed_refs->lock); } + return count; +} + +/* + * this starts processing the delayed reference count updates and + * extent insertions we have queued up so far. count can be + * 0, which means to process everything in the tree at the start + * of the run (but not newly added entries), or it can be some target + * number you'd like to process. + */ +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long count) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + struct list_head cluster; + int ret; + int run_all = count == (unsigned long)-1; + int run_most = 0; + + if (root == root->fs_info->extent_root) + root = root->fs_info->tree_root; + + delayed_refs = &trans->transaction->delayed_refs; + INIT_LIST_HEAD(&cluster); +again: + spin_lock(&delayed_refs->lock); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + while (1) { + if (!(run_all || run_most) && + delayed_refs->num_heads_ready < 64) + break; + + /* + * go find something we can process in the rbtree. We start at + * the beginning of the tree, and then build a cluster + * of refs to process starting at the first one we are able to + * lock + */ + ret = btrfs_find_ref_cluster(trans, &cluster, + delayed_refs->run_delayed_start); + if (ret) + break; + + ret = run_clustered_refs(trans, root, &cluster); + BUG_ON(ret < 0); + + count -= min_t(unsigned long, ret, count); + + if (count == 0) + break; + } + if (run_all) { - spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->root); - if (!node) { - spin_unlock(&delayed_refs->lock); + if (!node) goto out; - } + count = (unsigned long)-1; while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, @@ -1052,11 +1083,11 @@ again: node = rb_next(node); } spin_unlock(&delayed_refs->lock); - count = (unsigned long)-1; schedule_timeout(1); goto again; } out: + spin_unlock(&delayed_refs->lock); return 0; } @@ -2407,12 +2438,18 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, */ head->node.in_tree = 0; rb_erase(&head->node.rb_node, &delayed_refs->root); + delayed_refs->num_entries--; /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + + list_del_init(&head->cluster); spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root->fs_info->tree_root, @@ -3705,6 +3742,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_path *path; int i; int orig_level; + int update_count; struct btrfs_root_item *root_item = &root->root_item; WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); @@ -3746,6 +3784,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } } while (1) { + unsigned long update; wret = walk_down_tree(trans, root, path, &level); if (wret > 0) break; @@ -3764,6 +3803,14 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root } atomic_inc(&root->fs_info->throttle_gen); wake_up(&root->fs_info->transaction_throttle); + for (update_count = 0; update_count < 16; update_count++) { + update = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (update) + btrfs_run_delayed_refs(trans, root, update); + else + break; + } } for (i = 0; i <= orig_level; i++) { if (path->nodes[i]) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f94c2ad8996c..903edab3659a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -68,7 +68,10 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->delayed_refs.root.rb_node = NULL; cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); @@ -287,13 +290,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *info = root->fs_info; + int count = 0; - if (trans->delayed_ref_updates && - (trans->transaction->delayed_refs.flushing || - trans->transaction->delayed_refs.num_entries > 16384)) { - btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates); - } else if (trans->transaction->delayed_refs.num_entries > 64) { - wake_up_process(root->fs_info->transaction_kthread); + while (count < 4) { + unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (cur && + trans->transaction->delayed_refs.num_heads_ready > 64) { + trans->delayed_ref_updates = 0; + btrfs_run_delayed_refs(trans, root, cur); + } else { + break; + } + count++; } mutex_lock(&info->trans_mutex); @@ -929,7 +938,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ trans->transaction->delayed_refs.flushing = 1; - ret = btrfs_run_delayed_refs(trans, root, (u64)-1); + ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); INIT_LIST_HEAD(&dirty_fs_roots);