From 84850e8d8a5ec7b9d3c47d224e9a10c9da52ff1b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 29 Aug 2011 09:25:53 +0800 Subject: [PATCH 01/19] btrfs: check file extent backref offset underflow Offset field in data extent backref can underflow if clone range ioctl is used. We can reliably detect the underflow because max file size is limited to 2^63 and max data extent size is limited by block group size. Signed-off-by: Zheng Yan --- fs/btrfs/relocation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 10af6a0e0865..24d654ce7a06 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3322,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; From 60ccf82f5b6e26e10d41783464ca469c070c7d49 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Thu, 1 Sep 2011 16:33:57 +0200 Subject: [PATCH 02/19] btrfs: fix memory leak in btrfs_defrag_file kmemleak found this: unreferenced object 0xffff8801b64af968 (size 512): comm "btrfs-cleaner", pid 3317, jiffies 4306810886 (age 903.272s) hex dump (first 32 bytes): 00 82 01 07 00 ea ff ff c0 83 01 07 00 ea ff ff ................ 80 82 01 07 00 ea ff ff c0 87 01 07 00 ea ff ff ................ backtrace: [] kmemleak_alloc+0x5c/0xc0 [] kmem_cache_alloc_trace+0x163/0x240 [] btrfs_defrag_file+0xf0/0xb20 [] btrfs_run_defrag_inodes+0x165/0x210 [] cleaner_kthread+0x177/0x190 [] kthread+0x8d/0xa0 [] kernel_thread_helper+0x4/0x10 [] 0xffffffffffffffff "pages" is not always freed. Fix it by removing the unnecessary additional return. 
Signed-off-by: Diego Calleja --- fs/btrfs/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d2b53eb8a8c2..8ccc106f4e18 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1140,9 +1140,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, btrfs_set_super_incompat_flags(disk_super, features); } - if (!file) - kfree(ra); - return defrag_count; + ret = defrag_count; out_ra: if (!file) From cbcc83265d929ac71553c1b5dafdb830171af947 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:25 +0800 Subject: [PATCH 03/19] Btrfs: fix defragmentation regression There's an off-by-one bug: # create a file with lots of 4K file extents # btrfs fi defrag /mnt/file # sync # filefrag -v /mnt/file Filesystem type is: 9123683e File size of /mnt/file is 1228800 (300 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3372 64 1 64 3136 3435 1 2 65 3436 3136 64 3 129 3201 3499 1 4 130 3500 3201 64 5 194 3266 3563 1 6 195 3564 3266 64 7 259 3331 3627 1 8 260 3628 3331 40 eof After this patch: ... 
# filefrag -v /mnt/file Filesystem type is: 9123683e File size of /mnt/file is 1228800 (300 blocks, blocksize 4096) ext logical physical expected length flags 0 0 3372 300 eof /mnt/file: 1 extent found Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8ccc106f4e18..b39f7bf92704 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1087,7 +1087,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, defrag_count += ret; balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - i += ret; if (newer_than) { if (newer_off == (u64)-1) @@ -1107,7 +1106,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; } } else { - i++; + if (ret > 0) + i += ret; + else + i++; } } From 151a31b25e5c941bdd9fdefed650effca223c716 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:39 +0800 Subject: [PATCH 04/19] Btrfs: use i_size_read() in btrfs_defrag_file() Don't use inode->i_size directly, since we're not holding i_mutex. This also fixes another bug, that i_size can change after it's checked against 0 and then (i_size - 1) can be negative. 
Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b39f7bf92704..323d77f09258 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -978,6 +978,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; + u64 isize = i_size_read(inode); u64 features; u64 last_len = 0; u64 skip = 0; @@ -1003,7 +1004,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (inode->i_size == 0) + if (isize == 0) return 0; /* @@ -1028,10 +1029,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, + last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; } if (newer_than) { From 5ca496604b5975d371bb669ee6c2394bcbea818f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:56:55 +0800 Subject: [PATCH 05/19] Btrfs: fix wrong max_to_defrag in btrfs_defrag_file() It's off-by-one, and thus we may skip the last page while defragmenting. An example case: # create /mnt/file with 2 4K file extents # btrfs fi defrag /mnt/file # sync # filefrag /mnt/file /mnt/file: 2 extents found So it's not defragmented. 
Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 323d77f09258..f9026413bcf1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1052,7 +1052,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index - 1; + max_to_defrag = last_index; while (i <= last_index && defrag_count < max_to_defrag) { /* From 83c8c9bde0add721f7509aa446455183b040b931 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Wed, 14 Sep 2011 14:11:21 +0800 Subject: [PATCH 06/19] btrfs: trivial fix, a potential memory leak in btrfs_parse_early_options() Signed-off-by: Jie Liu --- fs/btrfs/super.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 266d1f35465d..09ce951666ea 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -419,7 +419,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; + char *device_name, *opts, *orig, *p; int error = 0; int intarg; @@ -470,8 +470,14 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) goto out; break; From 008873eafbc77deb1702aedece33756c58486c6a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 2 Sep 2011 15:57:07 +0800 Subject: [PATCH 07/19] Btrfs: honor extent thresh during defragmentation We won't defrag an extent, if it's bigger than the threshold we specified and there's no small extent before it, but actually the code doesn't work this way. 
There are three bugs: - When should_defrag_range() decides we should keep on defragmenting an extent, last_len is not incremented. (old bug) - The length that passes to should_defrag_range() is not the length we're going to defrag. (new bug) - We always defrag 256K bytes data, and a big extent can be part of this range. (new bug) For a file with 4 extents: | 4K | 4K | 256K | 256K | The result of defrag with (the default) 256K extent thresh should be: | 264K | 256K | but with those bugs, we'll get: | 520K | Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f9026413bcf1..d524b6697ad9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,7 +765,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, int ret = 1; /* - * make sure that once we start defragging and extent, we keep on + * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) @@ -810,7 +810,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. 
*/ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -984,13 +983,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; - int newer_left = 0; unsigned long i; + unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -1020,7 +1020,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * newer_cluster, + pages = kmalloc(sizeof(struct page *) * max_cluster, GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1045,7 +1045,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else goto out_ra; } else { @@ -1077,12 +1076,26 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = max(i + 1, next); continue; } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } - ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) goto out_ra; @@ -1102,15 +1115,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!ret) { range->start 
= newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else { break; } } else { - if (ret > 0) + if (ret > 0) { i += ret; - else + last_len += ret << PAGE_CACHE_SHIFT; + } else { i++; + last_len = 0; + } } } From f4c697e6406da5dd445eda8d923c53e1138793dd Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 5 Sep 2011 16:34:54 +0200 Subject: [PATCH 08/19] btrfs: return EINVAL if start > total_bytes in fitrim ioctl We should return EINVAL if the start is beyond the end of the file system in the btrfs_ioctl_fitrim(). Fix that by adding the appropriate check for it. Also in the btrfs_trim_fs() it is possible that len+start might overflow if big values are passed. Fix it by decrementing the len so that start+len is equal to the file system size in the worst case. Signed-off-by: Lukas Czerner --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d524b6697ad9..136a2f980e21 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,6 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -300,12 +301,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) } } rcu_read_unlock(); + if (!num_devices) return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; + if (range.start > total_bytes) + return -EINVAL; + range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(root, &range); if (ret < 0) From a05a9bb18ae0abec0b513b5fde876c47905fa13e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Sep 2011 16:55:34 +0800 Subject: [PATCH 09/19] Btrfs: fix array bound checking Otherwise we can exceed the array bound of path->slots[]. 
Signed-off-by: Li Zefan --- fs/btrfs/ctree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 011cab3aca8d..0fe615e4ea38 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; From 560f7d75457f86a43970aa413e334e394082dce4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Sep 2011 10:22:01 +0800 Subject: [PATCH 10/19] Btrfs: remove BUG_ON() in compress_file_range() It's not a big deal if we fail to allocate the array, and instead of panic we can just give up compressing. 
Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f12747c9447b..81d4f68f35c9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,7 +393,10 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - BUG_ON(!pages); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +427,7 @@ again: will_compress = 1; } } +cont: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); From f0dd9592a1aa014b3a01aa2be7e795aae040d65b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Sep 2011 10:26:51 +0800 Subject: [PATCH 11/19] Btrfs: fix direct-io vs nodatacow To reproduce the bug: # mount -o nodatacow /dev/sda7 /mnt/ # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB) copied, 0.000136115 s, 30.1 MB/s # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 conv=notrunc oflag=direct dd: writing `/mnt/tmp': Input/output error 1+0 records in 0+0 records out btrfs_ordered_update_i_size() may return 1, but btrfs_endio_direct_write() mistakenly takes it as an error. 
Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81d4f68f35c9..65474d95f26f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5777,8 +5777,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - ret = btrfs_update_inode(trans, root, inode); - err = ret; + err = btrfs_update_inode(trans, root, inode); goto out; } From fee187d9d9ddc382c81370a9a280391132dea2e1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Sep 2011 15:55:28 +0800 Subject: [PATCH 12/19] Btrfs: do not set EXTENT_DIRTY along with EXTENT_DELALLOC Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0ada0b700b44..f284d4e5f447 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1107,7 +1107,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 2 Oct 2011 13:56:53 +0300 Subject: [PATCH 13/19] Btrfs: pass the correct root to lookup_free_space_inode() Free space items are located in tree of tree roots, not in the extent tree. It didn't pop up because lookup_free_space_inode() grabs the inode all the time instead of actually searching the tree. 
Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4eb7d2ba38f8..6cfcc9060c83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); From cfbffc39ac89dbd5197cbeec2599a1128eb928f8 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 6 Oct 2011 13:37:08 +0900 Subject: [PATCH 14/19] Btrfs: fix return value of btrfs_get_acl() In btrfs_get_acl(), when the second __btrfs_getxattr() call fails, acl is not correctly set. Therefore, a wrong value might return to the caller. Signed-off-by: Tsutomu Itoh --- fs/btrfs/acl.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index eb159aaa5a11..89b156d85d63 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) { - kfree(value); - return acl; - } - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? 
I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 9 Sep 2011 17:34:35 +0800 Subject: [PATCH 15/19] Btrfs: fix race between multi-task space allocation and caching space The task may fail to get free space though it is enough when multi-task space allocation and caching space happen at the same time. Task1 Caching Thread Task2 ------------------------------------------------------------------------ find_free_extent The space has not be cached, and start caching thread. And wait for it. cache space, if the space is > 2MB wake up Task1 find_free_extent get all the space that is cached. try to allocate space, but there is no space now. trigger BUG_ON() The message is following: btrfs allocation failed flags 1, wanted 4096 space_info has 1040187392 free, is not full space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0 block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:835! [] __extent_writepage+0x1bf/0x5ce [btrfs] [] ? __set_page_dirty_nobuffers+0xfe/0x108 [] ? wait_current_trans+0x23/0xec [btrfs] [] ? find_get_pages_tag+0x73/0xe2 [] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs] [] extent_writepages+0x3e/0x53 [btrfs] [] ? do_sync_write+0xc6/0x103 [] ? btrfs_submit_direct+0x414/0x414 [btrfs] [] ? 
fsnotify+0x236/0x266 [] btrfs_writepages+0x22/0x24 [btrfs] [] do_writepages+0x1c/0x25 [] __filemap_fdatawrite_range+0x4e/0x50 [] filemap_write_and_wait_range+0x28/0x51 [] btrfs_sync_file+0x7d/0x198 [btrfs] [] ? fsnotify_modify+0x5d/0x65 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b [SNIP] RIP [] cow_file_range+0x1c4/0x32b [btrfs] We fix this bug by trying to allocate the space again if there are block groups in caching. Signed-off-by: Miao Xie --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6cfcc9060c83..cef355f1328a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -5036,6 +5037,7 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -5244,6 +5246,8 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: @@ -5294,6 +5298,9 @@ loop: } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; From 068132bad1de70f85f5f6d12c36d64f8f7848d92 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Thu, 23 Jun 2011 23:01:01 +0800 Subject: [PATCH 16/19] btrfs: fix oops on failure path If lookup_extent_backref fails, path->nodes[0] reasonably could be null along with other callers of btrfs_print_leaf, so ensure we have a valid extent buffer before dereferencing. 
Signed-off-by: Daniel J Blueman --- fs/btrfs/print-tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e9..f38e452486b8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; + if (!l) + return; + + nr = btrfs_header_nritems(l); + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); From 5f524444c351e145a5f7e28253594688a421bfe8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Oct 2011 00:20:43 +0300 Subject: [PATCH 17/19] Btrfs: fix a bug when opening seed devices Initialize fs_info->bdev_holder a bit earlier to be able to pass a correct holder id to blkdev_get() when opening seed devices with O_EXCL. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 09ce951666ea..29eecbb6ec3a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -939,6 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 
1 : 0); if (error) { @@ -946,7 +947,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } From 20bcd64934e4eb8f3f90a0dca54fb0ac2edd7795 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 20 Oct 2011 00:06:20 +0300 Subject: [PATCH 18/19] Btrfs: close all bdevs on mount failure Fix a bug introduced by 20b45077. We have to return EINVAL on mount failure, but doing that too early in the sequence leaves all of the devices opened exclusively. This also fixes an issue where under some scenarios only a second mount -o degraded command would succeed. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e138af710de2..c6938b45e0fd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -597,10 +597,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; + if (!bh) goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -655,7 +653,7 @@ error: continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; From f9d9ef62cd3ecbd6cbb7957a253c1e81f69d5586 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 29 Sep 2011 13:11:33 +0200 Subject: [PATCH 19/19] btrfs: do not allow mounting non-subvolumes via subvol option There's a missing test whether the path passed to subvol=path option during mount is a real subvolume, allowing any directory located in default subvolume to be passed and accepted for mount. (current btrfs progs prevent this early) $ btrfs subvol snapshot . p1-snap ERROR: '.' is not a subvolume (with "is subvolume?" test bypassed) $ btrfs subvol snapshot . 
p1-snap Create a snapshot of '.' in './p1-snap' $ btrfs subvol list -p . ID 258 parent 5 top level 5 path subvol ID 259 parent 5 top level 5 path subvol1 ID 260 parent 5 top level 5 path default-subvol1 ID 262 parent 5 top level 5 path p1/p1-snapshot ID 263 parent 259 top level 5 path subvol1/subvol1-snap The problem I see is that this makes a false impression of snapshotting the given subvolume but in fact snapshots the default one: a user expects outcome like ID 263 but in fact gets ID 262 . This patch makes mount fail with EINVAL with a message in syslog. Signed-off-by: David Sterba --- fs/btrfs/super.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 29eecbb6ec3a..5429b1fa0bfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -740,6 +740,16 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + /* * This will strip out the subvol=%s argument for an argument string and add * subvolid=0 to make sure we get the actual tree root for path walking to the @@ -843,6 +853,15 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (error) return ERR_PTR(error); + if (!is_subvolume_inode(path.dentry->d_inode)) { + path_put(&path); + mntput(mnt); + error = -EINVAL; + printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + subvol_name); + return ERR_PTR(-EINVAL); + } + /* Get a ref to the sb and the dentry we found and return it */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active);