diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bcec06750232..3f75895c919b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -24,22 +24,135 @@ #include "delayed-ref.h" #include "locking.h" +struct extent_inode_elem { + u64 inum; + u64 offset; + struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, + struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 data_offset; + u64 data_len; + struct extent_inode_elem *e; + + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = key->offset + (extent_item_pos - data_offset); + *eie = e; + + return 0; +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != wanted_disk_byte) + continue; + + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + if (ret < 0) + return ret; + } + + return 0; +} + /* * this structure records all encountered refs on the way up to the root */ struct __prelim_ref { struct list_head list; u64 root_id; - struct btrfs_key key; + struct btrfs_key key_for_search; int level; int count; + struct extent_inode_elem *inode_list; u64 parent; u64 wanted_disk_byte; }; +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see __add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ + static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, u64 parent, - u64 wanted_disk_byte, int count) + struct btrfs_key *key, int level, + u64 parent, u64 wanted_disk_byte, int count) { struct __prelim_ref *ref; @@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, ref->root_id = root_id; if (key) - ref->key = *key; + ref->key_for_search = *key; else - memset(&ref->key, 0, sizeof(ref->key)); + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + ref->inode_list = NULL; ref->level = level; ref->count = count; ref->parent = parent; @@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, - struct extent_buffer *eb, int level, - u64 wanted_objectid, u64 wanted_disk_byte) + struct ulist *parents, int level, + struct btrfs_key *key, u64 wanted_disk_byte, + const u64 *extent_item_pos) { int ret; - int slot; + int slot = path->slots[level]; + struct extent_buffer *eb = path->nodes[level]; struct btrfs_file_extent_item *fi; - struct btrfs_key key; + struct extent_inode_elem *eie = NULL; u64 disk_byte; + u64 wanted_objectid = key->objectid; add_parent: - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (level == 0 && extent_item_pos) { + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); + if (ret < 0) + return ret; + } + ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); if (ret < 0) return ret; @@ -89,6 +211,7 @@ add_parent: * repeat this until we don't find any additional EXTENT_DATA items. */ while (1) { + eie = NULL; ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; @@ -97,9 +220,9 @@ add_parent: eb = path->nodes[0]; for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != wanted_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) + btrfs_item_key_to_cpu(eb, key, slot); + if (key->objectid != wanted_objectid || + key->type != BTRFS_EXTENT_DATA_KEY) return 0; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -118,8 +241,10 @@ add_parent: */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int search_commit_root, + u64 time_seq, struct __prelim_ref *ref, - struct ulist *parents) + struct ulist *parents, + const u64 *extent_item_pos) { struct btrfs_path *path; struct btrfs_root *root; @@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key.objectid, ref->key.type, - (unsigned long long)ref->key.offset); + (unsigned long long)ref->key_for_search.objectid, + ref->key_for_search.type, + (unsigned long long)ref->key_for_search.offset); if (ret < 0) goto out; @@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, btrfs_item_key_to_cpu(eb, &key, path->slots[0]); } - /* the last two parameters will only be used for level == 0 */ - ret = add_all_parents(root, path, parents, eb, level, key.objectid, - ref->wanted_disk_byte); + ret = add_all_parents(root, path, parents, level, &key, + ref->wanted_disk_byte, extent_item_pos); out: btrfs_free_path(path); return ret; @@ -191,8 +316,9 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, - struct list_head *head) + int search_commit_root, u64 time_seq, + struct list_head *head, + const u64 *extent_item_pos) { int err; int ret = 0; @@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct __prelim_ref *new_ref; struct ulist *parents; struct ulist_node *node; + struct ulist_iterator uiter; parents = ulist_alloc(GFP_NOFS); if (!parents) @@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, search_commit_root, - ref, parents); + time_seq, ref, parents, + extent_item_pos); if (err) { if (ret == 0) ret = err; @@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* we put the first parent into the ref at hand */ - node = ulist_next(parents, NULL); + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; + ref->inode_list = + node ? (struct extent_inode_elem *)node->aux : 0; /* additional parents require new refs being added here */ - while ((node = ulist_next(parents, node))) { + while ((node = ulist_next(parents, &uiter))) { new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); if (!new_ref) { ret = -ENOMEM; @@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; + new_ref->inode_list = + (struct extent_inode_elem *)node->aux; list_add(&new_ref->list, &ref->list); } ulist_reinit(parents); @@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, return ret; } +static inline int ref_for_same_block(struct __prelim_ref *ref1, + struct __prelim_ref *ref2) +{ + if (ref1->level != ref2->level) + return 0; + if (ref1->root_id != ref2->root_id) + return 0; + if (ref1->key_for_search.type != ref2->key_for_search.type) + return 0; + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) + return 0; + if (ref1->key_for_search.offset != ref2->key_for_search.offset) + return 0; + if (ref1->parent != ref2->parent) + return 0; + + return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + struct list_head *pos; + struct extent_buffer *eb; + + list_for_each(pos, head) { + struct __prelim_ref *ref; + ref = list_entry(pos, struct __prelim_ref, list); + + if (ref->parent) + continue; + if (ref->key_for_search.type) + continue; + BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, + fs_info->tree_root->leafsize, 0); + BUG_ON(!eb); + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return 0; +} + /* * merge two lists of backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + * additionally, we could even add a key range for the blocks we + * looked into to merge even more (-> replace unresolved refs by those + * having a parent). * mode = 2: merge identical parents */ static int __merge_refs(struct list_head *head, int mode) @@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = list_entry(pos1, struct __prelim_ref, list); - if (mode == 1 && ref1->key.type == 0) - continue; for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; + struct __prelim_ref *xchg; ref2 = list_entry(pos2, struct __prelim_ref, list); if (mode == 1) { - if (memcmp(&ref1->key, &ref2->key, - sizeof(ref1->key)) || - ref1->level != ref2->level || - ref1->root_id != ref2->root_id) + if (!ref_for_same_block(ref1, ref2)) continue; + if (!ref1->parent && ref2->parent) { + xchg = ref1; + ref1 = ref2; + ref2 = xchg; + } ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) @@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct btrfs_key *info_key, struct list_head *prefs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; + struct btrfs_key key; + struct btrfs_key op_key = {0}; int sgn; int ret = 0; if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(info_key, &extent_op->key); + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); while ((n = rb_prev(n))) { struct btrfs_delayed_ref_node *node; @@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, + ret = __add_prelim_ref(prefs, ref->root, &op_key, ref->level + 1, 0, node->bytenr, node->ref_mod * sgn); break; @@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, + ret = __add_prelim_ref(prefs, ref->root, NULL, ref->level + 1, ref->parent, node->bytenr, node->ref_mod * sgn); @@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, } case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - ref = btrfs_delayed_node_to_data_ref(node); key.objectid = ref->objectid; @@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, } case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; ref = btrfs_delayed_node_to_data_ref(node); @@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int *info_level, - struct list_head *prefs) + int *info_level, struct list_head *prefs) { int ret = 0; int slot; @@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, * enumerate all inline refs */ leaf = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; item_size = btrfs_item_size_nr(leaf, slot); BUG_ON(item_size < sizeof(*ei)); @@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; - struct btrfs_disk_key disk_key; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); - btrfs_tree_block_key(leaf, info, &disk_key); - btrfs_disk_key_to_cpu(info_key, &disk_key); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); } else { @@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, + ret = __add_prelim_ref(prefs, 0, NULL, *info_level + 1, offset, bytenr, 1); break; @@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, info_key, - *info_level + 1, 0, bytenr, 1); + ret = __add_prelim_ref(prefs, offset, NULL, + *info_level + 1, 0, + bytenr, 1); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, - count); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); break; } default: @@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, */ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int info_level, - struct list_head *prefs) + int info_level, struct list_head *prefs) { struct btrfs_root *extent_root = fs_info->extent_root; int ret; @@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, + ret = __add_prelim_ref(prefs, 0, NULL, info_level + 1, key.offset, bytenr, 1); break; @@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, info_key, - info_level + 1, 0, bytenr, 1); + ret = __add_prelim_ref(prefs, key.offset, NULL, + info_level + 1, 0, + bytenr, 1); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count); break; } default: @@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots) + u64 delayed_ref_seq, u64 time_seq, + struct ulist *refs, struct ulist *roots, + const u64 *extent_item_pos) { struct btrfs_key key; struct btrfs_path *path; - struct btrfs_key info_key = { 0 }; struct btrfs_delayed_ref_root *delayed_refs = NULL; struct btrfs_delayed_ref_head *head; int info_level = 0; @@ -645,7 +830,7 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, seq, &info_key, + ret = __add_delayed_refs(head, delayed_ref_seq, &prefs_delayed); if (ret) { spin_unlock(&delayed_refs->lock); @@ -659,16 +844,17 @@ again: struct extent_buffer *leaf; int slot; + path->slots[0]--; leaf = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_key, &info_level, &prefs); + &info_level, &prefs); if (ret) goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + ret = __add_keyed_refs(fs_info, path, bytenr, info_level, &prefs); if (ret) goto out; @@ -676,21 +862,18 @@ again: } btrfs_release_path(path); - /* - * when adding the delayed refs above, the info_key might not have - * been known yet. Go over the list and replace the missing keys - */ - list_for_each_entry(ref, &prefs_delayed, list) { - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) - memcpy(&ref->key, &info_key, sizeof(ref->key)); - } list_splice_init(&prefs_delayed, &prefs); + ret = __add_missing_keys(fs_info, &prefs); + if (ret) + goto out; + ret = __merge_refs(&prefs, 1); if (ret) goto out; - ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); + ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, + &prefs, extent_item_pos); if (ret) goto out; @@ -709,7 +892,33 @@ again: BUG_ON(ret < 0); } if (ref->count && ref->parent) { - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + struct extent_inode_elem *eie = NULL; + if (extent_item_pos && !ref->inode_list) { + u32 bsz; + struct extent_buffer *eb; + bsz = btrfs_level_size(fs_info->extent_root, + info_level); + eb = read_tree_block(fs_info->extent_root, + ref->parent, bsz, 0); + BUG_ON(!eb); + ret = find_extent_in_eb(eb, bytenr, + *extent_item_pos, &eie); + ref->inode_list = eie; + free_extent_buffer(eb); + } + ret = ulist_add_merge(refs, ref->parent, + (unsigned long)ref->inode_list, + (unsigned long *)&eie, GFP_NOFS); + if (!ret && extent_item_pos) { + /* + * we've recorded that parent, so we must extend + * its inode list here + */ + BUG_ON(!eie); + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } BUG_ON(ret < 0); } kfree(ref); @@ -734,6 +943,28 @@ out: return ret; } +static void free_leaf_list(struct ulist *blocks) +{ + struct ulist_node *node = NULL; + struct extent_inode_elem *eie; + struct extent_inode_elem *eie_next; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(blocks, &uiter))) { + if (!node->aux) + continue; + eie = (struct extent_inode_elem *)node->aux; + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } + node->aux = 0; + } + + ulist_free(blocks); +} + /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -744,7 +975,9 @@ out: */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **leafs) + u64 delayed_ref_seq, u64 time_seq, + struct ulist **leafs, + const u64 *extent_item_pos) { struct ulist *tmp; int ret; @@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, *leafs, tmp, extent_item_pos); ulist_free(tmp); if (ret < 0 && ret != -ENOENT) { - ulist_free(*leafs); + free_leaf_list(*leafs); return ret; } @@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots) + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; + struct ulist_iterator uiter; int ret; tmp = ulist_alloc(GFP_NOFS); @@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return -ENOMEM; } + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots); + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); return ret; } - node = ulist_next(tmp, node); + node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; @@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, - u64 orig_extent_item_objectid, - u64 extent_item_pos, u64 root, +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, iterate_extent_inodes_t *iterate, void *ctx) { - u64 disk_byte; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *eb; - int slot; - int nritems; + struct extent_inode_elem *eie; int ret = 0; - int extent_type; - u64 data_offset; - u64 data_len; - - eb = read_tree_block(fs_info->tree_root, logical, - fs_info->tree_root->leafsize, 0); - if (!eb) - return -EIO; - - /* - * from the shared data ref, we only have the leaf but we need - * the key. thus, we must look into all items and see that we - * find one (some) with a reference to our extent item. - */ - nritems = btrfs_header_nritems(eb); - for (slot = 0; slot < nritems; ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) - continue; - - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - continue; + for (eie = inode_list; eie; eie = eie->next) { pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " - "root %llu\n", orig_extent_item_objectid, - key.objectid, key.offset, root); - ret = iterate(key.objectid, - key.offset + (extent_item_pos - data_offset), - root, ctx); + "root %llu\n", extent_item_objectid, + eie->inum, eie->offset, root); + ret = iterate(eie->inum, eie->offset, root, ctx); if (ret) { - pr_debug("stopping iteration because ret=%d\n", ret); + pr_debug("stopping iteration for %llu due to ret=%d\n", + extent_item_objectid, ret); break; } } - free_extent_buffer(eb); - return ret; } @@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list seq_elem; + struct seq_list seq_elem = {}; + struct seq_list tree_mod_seq_elem = {}; + struct ulist_iterator ref_uiter; + struct ulist_iterator root_uiter; struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", @@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); btrfs_get_delayed_seq(delayed_refs, &seq_elem); spin_unlock(&delayed_refs->lock); + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - extent_item_pos, seq_elem.seq, - &refs); - + seq_elem.seq, tree_mod_seq_elem.seq, &refs, + &extent_item_pos); if (ret) goto out; - while (!ret && (ref_node = ulist_next(refs, ref_node))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, - seq_elem.seq, &roots); + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, + seq_elem.seq, + tree_mod_seq_elem.seq, &roots); if (ret) break; - while (!ret && (root_node = ulist_next(roots, root_node))) { - pr_debug("root %llu references leaf %llu\n", - root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, ref_node->val, - extent_item_objectid, - extent_item_pos, root_node->val, - iterate, ctx); + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { + pr_debug("root %llu references leaf %llu, data list " + "%#lx\n", root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs( + (struct extent_inode_elem *)ref_node->aux, + root_node->val, extent_item_objectid, + iterate, ctx); } + ulist_free(roots); + roots = NULL; } - ulist_free(refs); + free_leaf_list(refs); ulist_free(roots); out: if (!search_commit_root) { + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_put_delayed_seq(delayed_refs, &seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 57ea2e959e4d..c18d8ac7b795 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots); + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 99fcad631a21..d7a96cfdc50a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); + struct btrfs_path *path, int level, int slot, + int tree_mod_log); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid, + u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 time_seq); struct btrfs_path *btrfs_alloc_path(void) { @@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0, 1); + buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +enum mod_log_op { + MOD_LOG_KEY_REPLACE, + MOD_LOG_KEY_ADD, + MOD_LOG_KEY_REMOVE, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, + MOD_LOG_MOVE_KEYS, + MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { + int dst_slot; + int nr_items; +}; + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 index; /* shifted logical */ + struct seq_list elem; + enum mod_log_op op; + + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ + int slot; + + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ + u64 generation; + + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ + struct btrfs_disk_key key; + u64 blockptr; + + /* this is used for op == MOD_LOG_MOVE_KEYS */ + struct tree_mod_move move; + + /* this is used for op == MOD_LOG_ROOT_REPLACE */ + struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ + elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + elem->flags = 1; + spin_lock(&fs_info->tree_mod_seq_lock); + __get_tree_mod_seq(fs_info, elem); + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct seq_list *cur_elem; + struct tree_mod_elem *tm; + u64 min_seq = (u64)-1; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + BUG_ON(!(elem->flags & 1)); + spin_lock(&fs_info->tree_mod_seq_lock); + list_del(&elem->list); + + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { + if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (seq_putting > cur_elem->seq) { + /* + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ + goto out; + } + min_seq = cur_elem->seq; + } + } + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = container_of(node, struct tree_mod_elem, node); + if (tm->elem.seq > min_seq) + continue; + rb_erase(node, tm_root); + list_del(&tm->elem.list); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +out: + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + * index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + int ret = 0; + + BUG_ON(!tm || !tm->elem.seq); + + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = container_of(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->index < tm->index) + new = &((*new)->rb_left); + else if (cur->index > tm->index) + new = &((*new)->rb_right); + else if (cur->elem.seq < tm->elem.seq) + new = &((*new)->rb_left); + else if (cur->elem.seq > tm->elem.seq) + new = &((*new)->rb_right); + else { + kfree(tm); + ret = -EEXIST; + goto unlock; + } + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); +unlock: + write_unlock(&fs_info->tree_mod_log_lock); + return ret; +} + +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 1; + if (!eb) + return 0; + if (btrfs_header_level(eb) == 0) + return 1; + return 0; +} + +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, + struct tree_mod_elem **tm_ret) +{ + struct tree_mod_elem *tm; + int seq; + + if (tree_mod_dont_log(fs_info, NULL)) + return 0; + + tm = *tm_ret = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; + + tm->elem.flags = 0; + spin_lock(&fs_info->tree_mod_seq_lock); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list, because no blocker exists. items + * are removed from the list only when the existing blocker is + * removed from the list. + */ + kfree(tm); + seq = 0; + } else { + __get_tree_mod_seq(fs_info, &tm->elem); + seq = tm->elem.seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + if (op != MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + int slot, enum mod_log_op op) +{ + return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int dst_slot, int src_slot, + int nr_items, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + int i; + + if (tree_mod_dont_log(fs_info, eb)) + return 0; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING); + BUG_ON(ret < 0); + } + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *old_root, + struct extent_buffer *new_root, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = MOD_LOG_ROOT_REPLACE; + + return __tree_mod_log_insert(fs_info, tm); +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, + int smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + u64 index = start >> PAGE_CACHE_SHIFT; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = container_of(node, struct tree_mod_elem, node); + if (cur->index < index) { + node = node->rb_left; + } else if (cur->index > index) { + node = node->rb_right; + } else if (cur->elem.seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* we want the node with the highest seq */ + if (found) + BUG_ON(found->elem.seq > cur->elem.seq); + found = cur; + node = node->rb_left; + } else if (cur->elem.seq > min_seq) { + /* we want the node with the smallest seq */ + if (found) + BUG_ON(found->elem.seq < cur->elem.seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, + u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + struct extent_buffer *src, unsigned long dst_offset, + unsigned long src_offset, int nr_items) +{ + int ret; + int i; + + if (tree_mod_dont_log(fs_info, NULL)) + return; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return; + + /* speed this up by single seq for all operations? */ + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + int dst_offset, int src_offset, int nr_items) +{ + int ret; + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, + nr_items, GFP_NOFS); + BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int slot, int atomic) +{ + int ret; + + ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); + BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + int i; + int ret; + u32 nritems; + + if (tree_mod_dont_log(fs_info, eb)) + return; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, + struct extent_buffer *new_root_node) +{ + int ret; + tree_mod_log_free_eb(root->fs_info, root->node); + ret = tree_mod_log_insert_root(root->fs_info, root->node, + new_root_node, GFP_NOFS); + BUG_ON(ret < 0); +} + /* * check if the tree block can be shared by multiple trees */ @@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } + /* + * don't log freeing in case we're freeing the root node, this + * is done by tree_mod_log_set_root_pointer later + */ + if (buf != root->node && btrfs_header_level(buf) != 0) + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size, 1); + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; extent_buffer_get(cow); + tree_mod_log_set_root_pointer(root, cow); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; WARN_ON(trans->transid != btrfs_header_generation(parent)); + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = root->node->start; + int looped = 0; + + if (!time_seq) + return 0; + + /* + * the very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). this has the index of the *new* + * root, making it the very first operation that's logged for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(fs_info, root_logical, + time_seq); + if (!looped && !tm) + return 0; + /* + * we must have key remove operations in the log before the + * replace operation. + */ + BUG_ON(!tm); + + if (tm->op != MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + BUG_ON(root_logical == root->node->start); + looped = 1; + } + + return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + while (tm && tm->elem.seq >= time_seq) { + /* + * all the operations are recorded with the operator used for + * the modification. as we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case MOD_LOG_KEY_ADD: + if (tm->slot != n - 1) { + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->slot + 1); + memmove_extent_buffer(eb, o_dst, o_src, p_size); + } + n--; + break; + case MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case MOD_LOG_ROOT_REPLACE: + /* + * this operation is special. for roots, this must be + * handled explicitly before rewinding. + * for non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. in the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. we simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = container_of(next, struct tree_mod_elem, node); + if (tm->index != first_tm->index) + break; + } + btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(eb->start, + fs_info->tree_root->nodesize); + BUG_ON(!eb_rewin); + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + BUG_ON(!eb_rewin); + } + + extent_buffer_get(eb_rewin); + free_extent_buffer(eb); + + __tree_mod_log_rewind(eb_rewin, time_seq, tm); + + return eb_rewin; +} + +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct extent_buffer *eb; + struct tree_mod_root *old_root; + u64 old_generation; + + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (!tm) + return root->node; + + old_root = &tm->old_root; + old_generation = tm->generation; + + tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); + /* + * there was an item in the log when __tree_mod_log_oldest_root + * returned. this one must not go away, because the time_seq passed to + * us must be blocking its removal. + */ + BUG_ON(!tm); + + if (old_root->logical == root->node->start) { + /* there are logged operations for the current root */ + eb = btrfs_clone_extent_buffer(root->node); + } else { + /* there's a root replace operation for the current root */ + eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, + root->nodesize); + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, root->root_key.objectid); + } + if (!eb) + return NULL; + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + __tree_mod_log_rewind(eb, time_seq, tm); + + return eb; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -976,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -989,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -1042,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1); + del_ptr(trans, root, path, level + 1, pslot + 1, 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1, 0); + btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &right_key, pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1084,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot); + del_ptr(trans, root, path, level + 1, pslot, 1); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1190,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1241,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1498,7 +2158,7 @@ static int read_block_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key) + struct btrfs_key *key, u64 time_seq) { u64 blocknr; u64 gen; @@ -1852,7 +2512,7 @@ cow_done: } err = read_block_for_search(trans, root, p, - &b, level, slot, key); + &b, level, slot, key, 0); if (err == -EAGAIN) goto again; if (err) { @@ -1923,6 +2583,115 @@ done: return ret; } +/* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + b = get_old_root(root, time_seq); + extent_buffer_get(b); + level = btrfs_header_level(b); + btrfs_tree_read_lock(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + level = btrfs_header_level(b); + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(NULL, root, p, &b, level, + slot, key, time_seq); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + b = tree_mod_log_rewind(root->fs_info, b, time_seq); + if (b != p->nodes[level]) { + btrfs_tree_unlock_rw(p->nodes[level], + p->locks[level]); + p->locks[level] = 0; + p->nodes[level] = b; + } + } else { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + } + ret = 1; +done: + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -1943,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; + tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -2025,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { + tree_mod_log_eb_move(root->fs_info, src, 0, push_items, + src_nritems - push_items); memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * @@ -2084,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); + tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2131,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0, 0); + level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2163,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; + tree_mod_log_set_root_pointer(root, c); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -2186,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, static void insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) + int slot, int level, int tree_mod_log) { struct extent_buffer *lower; int nritems; + int ret; BUG_ON(!path->nodes[level]); btrfs_assert_tree_locked(path->nodes[level]); @@ -2198,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + slot, nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); WARN_ON(trans->transid == 0); @@ -2254,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2273,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -2286,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(split); insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + path->slots[level + 1] + 1, level + 1, 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2823,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + path->slots[1] + 1, 1, 0); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3006,7 +3793,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3030,7 +3817,7 @@ again: if (mid <= slot) { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + path->slots[1] + 1, 1, 0); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3039,7 +3826,7 @@ again: } else { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1); + path->slots[1], 1, 0); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3751,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. */ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) + struct btrfs_path *path, int level, int slot, + int tree_mod_log) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; + int ret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, parent, slot, + slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); + } else if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); } + nritems--; btrfs_set_header_nritems(parent, nritems); if (nritems == 0 && parent == root->node) { @@ -3795,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1]); + del_ptr(trans, root, path, 1, path->slots[1], 1); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3806,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); extent_buffer_get(leaf); - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + btrfs_free_tree_block(trans, root, leaf, 0, 1); free_extent_buffer_stale(leaf); } /* @@ -4273,7 +5070,7 @@ again: next = c; next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key); + slot, &key, 0); if (ret == -EAGAIN) goto again; @@ -4310,7 +5107,7 @@ again: break; ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key); + 0, &key, 0); if (ret == -EAGAIN) goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1c665ebe47e0..0151ca1ac657 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1140,6 +1140,15 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + /* this protects tree_mod_seq_list */ + spinlock_t tree_mod_seq_lock; + atomic_t tree_mod_seq; + struct list_head tree_mod_seq_list; + + /* this protects tree_mod_log */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; @@ -2537,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow); + u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow); + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2700,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, @@ -3139,4 +3150,23 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; + u32 flags; +}; + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); + +static inline int is_fstree(u64 rootid) +{ + if (rootid == BTRFS_FS_TREE_OBJECTID || + (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 69f22e3ab3bc..13ae7b04790e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) + if (is_fstree(ref_root)) seq = inc_delayed_seq(delayed_refs); ref->seq = seq; @@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) + if (is_fstree(ref_root)) seq = inc_delayed_seq(delayed_refs); ref->seq = seq; @@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && + if (!is_fstree(ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + return 0; } @@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && + if (!is_fstree(ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d8f244d94925..413927fb9957 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); -struct seq_list { - struct list_head list; - u64 seq; -}; - static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) { assert_spin_locked(&delayed_refs->lock); @@ -229,25 +224,6 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, u64 seq); -/* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b0d49e21b0b1..b99d5127ba18 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0, 0); + 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv); btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); @@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; fs_info->free_chunk_space = 0; + fs_info->tree_mod_log = RB_ROOT; /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1902726fa70a..4b5a1e1bdefb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5218,7 +5218,7 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) + u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; int ret; @@ -5228,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, 0); BUG_ON(ret); /* -ENOMEM */ } @@ -6250,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow) + u64 hint, u64 empty_size) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6298,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); + extent_op, 0); BUG_ON(ret); /* -ENOMEM */ } return buf; @@ -6716,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b3692c1373aa..2c8f7b204617 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3924,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, eb->start = start; eb->len = len; eb->tree = tree; + eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3961,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return eb; } +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +{ + unsigned long i; + struct page *p; + struct extent_buffer *new; + unsigned long num_pages = num_extent_pages(src->start, src->len); + + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); + if (new == NULL) + return NULL; + + for (i = 0; i < num_pages; i++) { + p = alloc_page(GFP_ATOMIC); + BUG_ON(!p); + attach_extent_buffer_page(new, p); + WARN_ON(PageDirty(p)); + SetPageUptodate(p); + new->pages[i] = p; + } + + copy_extent_buffer(new, src, 0, 0, src->len); + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + + return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +{ + struct extent_buffer *eb; + unsigned long num_pages = num_extent_pages(0, len); + unsigned long i; + + eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); + if (!eb) + return NULL; + + for (i = 0; i < num_pages; i++) { + eb->pages[i] = alloc_page(GFP_ATOMIC); + if (!eb->pages[i]) + goto err; + } + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + return eb; +err: + for (i--; i > 0; i--) + __free_page(eb->pages[i]); + __free_extent_buffer(eb); + return NULL; +} + static int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || @@ -3977,6 +4032,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long index; unsigned long num_pages; struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); BUG_ON(extent_buffer_under_io(eb)); @@ -3988,7 +4044,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, do { index--; page = extent_buffer_page(eb, index); - if (page) { + if (page && mapped) { spin_lock(&page->mapping->private_lock); /* * We do this since we'll remove the pages after we've @@ -4013,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, } spin_unlock(&page->mapping->private_lock); + } + if (page) { /* One for when we alloced the page */ page_cache_release(page); } @@ -4231,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - struct extent_io_tree *tree = eb->tree; + if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + } else { + struct extent_io_tree *tree = eb->tree; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, - eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + } /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb, 0); @@ -4255,6 +4317,10 @@ void free_extent_buffer(struct extent_buffer *eb) return; spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + atomic_dec(&eb->refs); + if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4d8124b64577..25900af5b15d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,6 +39,7 @@ #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_DUMMY 9 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -264,6 +265,8 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f8c354c4c76..24b776c08d99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -368,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 82b03afcbd92..1791c6e3d834 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -56,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root) static noinline int join_transaction(struct btrfs_root *root, int nofail) { struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *fs_info = root->fs_info; - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - spin_unlock(&root->fs_info->trans_lock); + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&fs_info->trans_lock); return -EROFS; } - if (root->fs_info->trans_no_join) { + if (fs_info->trans_no_join) { if (!nofail) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return -EBUSY; } } - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; if (cur_trans) { if (cur_trans->aborted) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return 0; } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; - spin_lock(&root->fs_info->trans_lock); - if (root->fs_info->running_transaction) { + spin_lock(&fs_info->trans_lock); + if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the trans_no_join checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; goto loop; } @@ -122,20 +123,38 @@ loop: cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; cur_trans->delayed_refs.seq = 1; + + /* + * although the tree mod log is per file system and not per transaction, + * the log must never go across transaction boundaries. + */ + smp_mb(); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " + "creating a fresh transaction\n"); + WARN_ON(1); + } + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { + printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + "creating a fresh transaction\n"); + WARN_ON(1); + } + atomic_set(&fs_info->tree_mod_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping); - root->fs_info->generation++; - cur_trans->transid = root->fs_info->generation; - root->fs_info->running_transaction = cur_trans; + fs_info->btree_inode->i_mapping); + fs_info->generation++; + cur_trans->transid = fs_info->generation; + fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return 0; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ad993bc2df93..ab942f46b3dd 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -23,9 +23,9 @@ * * ulist = ulist_alloc(); * ulist_add(ulist, root); - * elem = NULL; + * ULIST_ITER_INIT(&uiter); * - * while ((elem = ulist_next(ulist, elem)) { + * while ((elem = ulist_next(ulist, &uiter)) { * for (all child nodes n in elem) * ulist_add(ulist, n); * do something useful with the node; @@ -145,12 +145,21 @@ EXPORT_SYMBOL(ulist_free); */ int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, gfp_t gfp_mask) +{ + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long *old_aux, gfp_t gfp_mask) { int i; for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) + if (ulist->nodes[i].val == val) { + if (old_aux) + *old_aux = ulist->nodes[i].aux; return 0; + } } if (ulist->nnodes >= ulist->nodes_alloced) { @@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist * @ulist: ulist to iterate - * @prev: previously returned element or %NULL to start iteration + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) * * Note: locking must be provided by the caller. In case of rwlocks only read * locking is needed * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the * end is reached. No guarantee is made with respect to the order in which * the elements are returned. They might neither be returned in order of * addition nor in ascending order. * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - int next; - if (ulist->nnodes == 0) return NULL; - - if (!prev) - return &ulist->nodes[0]; - - next = (prev - ulist->nodes) + 1; - if (next < 0 || next >= ulist->nnodes) + if (uiter->i < 0 || uiter->i >= ulist->nnodes) return NULL; - return &ulist->nodes[next]; + return &ulist->nodes[uiter->i++]; } EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 6568c3527732..21bdc8ec8130 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -24,6 +24,10 @@ */ #define ULIST_SIZE 16 +struct ulist_iterator { + int i; +}; + /* * element of the list */ @@ -61,7 +65,13 @@ void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, gfp_t gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long *old_aux, gfp_t gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, + struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) #endif