From 84850e8d8a5ec7b9d3c47d224e9a10c9da52ff1b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 29 Aug 2011 09:25:53 +0800
Subject: [PATCH 01/19] btrfs: check file extent backref offset underflow

Offset field in data extent backref can underflow if clone range ioctl
is used. We can reliably detect the underflow because max file size is
limited to 2^63 and max data extent size is limited by block group size.

Signed-off-by: Zheng Yan  <zheng.z.yan@intel.com>
---
 fs/btrfs/relocation.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 10af6a0e0865..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3322,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
 	}
 
 	key.objectid = ref_objectid;
-	key.offset = ref_offset;
 	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (ref_offset > ((u64)-1 << 32))
+		key.offset = 0;
+	else
+		key.offset = ref_offset;
 
 	path->search_commit_root = 1;
 	path->skip_locking = 1;

From 60ccf82f5b6e26e10d41783464ca469c070c7d49 Mon Sep 17 00:00:00 2001
From: Diego Calleja <diegocg@gmail.com>
Date: Thu, 1 Sep 2011 16:33:57 +0200
Subject: [PATCH 02/19] btrfs: fix memory leak in btrfs_defrag_file

kmemleak found this:
unreferenced object 0xffff8801b64af968 (size 512):
  comm "btrfs-cleaner", pid 3317, jiffies 4306810886 (age 903.272s)
  hex dump (first 32 bytes):
    00 82 01 07 00 ea ff ff c0 83 01 07 00 ea ff ff  ................
    80 82 01 07 00 ea ff ff c0 87 01 07 00 ea ff ff  ................
  backtrace:
    [<ffffffff816875cc>] kmemleak_alloc+0x5c/0xc0
    [<ffffffff8114aec3>] kmem_cache_alloc_trace+0x163/0x240
    [<ffffffff8127a290>] btrfs_defrag_file+0xf0/0xb20
    [<ffffffff8125d9a5>] btrfs_run_defrag_inodes+0x165/0x210
    [<ffffffff812479d7>] cleaner_kthread+0x177/0x190
    [<ffffffff81075c7d>] kthread+0x8d/0xa0
    [<ffffffff816af5f4>] kernel_thread_helper+0x4/0x10
    [<ffffffffffffffff>] 0xffffffffffffffff

"pages" is not always freed. Fix it removing the unnecesary additional return.

Signed-off-by: Diego Calleja <diegocg@gmail.com>
---
 fs/btrfs/ioctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d2b53eb8a8c2..8ccc106f4e18 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1140,9 +1140,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		btrfs_set_super_incompat_flags(disk_super, features);
 	}
 
-	if (!file)
-		kfree(ra);
-	return defrag_count;
+	ret = defrag_count;
 
 out_ra:
 	if (!file)

From cbcc83265d929ac71553c1b5dafdb830171af947 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:56:25 +0800
Subject: [PATCH 03/19] Btrfs: fix defragmentation regression

There's an off-by-one bug:

  # create a file with lots of 4K file extents
  # btrfs fi defrag /mnt/file
  # sync
  # filefrag -v /mnt/file
  Filesystem type is: 9123683e
  File size of /mnt/file is 1228800 (300 blocks, blocksize 4096)
   ext logical physical expected length flags
     0       0     3372              64
     1      64     3136     3435      1
     2      65     3436     3136     64
     3     129     3201     3499      1
     4     130     3500     3201     64
     5     194     3266     3563      1
     6     195     3564     3266     64
     7     259     3331     3627      1
     8     260     3628     3331     40 eof

After this patch:

  ...
  # filefrag -v /mnt/file
  Filesystem type is: 9123683e
  File size of /mnt/file is 1228800 (300 blocks, blocksize 4096)
   ext logical physical expected length flags
     0       0     3372             300 eof
  /mnt/file: 1 extent found

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8ccc106f4e18..b39f7bf92704 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1087,7 +1087,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 		defrag_count += ret;
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
-		i += ret;
 
 		if (newer_than) {
 			if (newer_off == (u64)-1)
@@ -1107,7 +1106,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 				break;
 			}
 		} else {
-			i++;
+			if (ret > 0)
+				i += ret;
+			else
+				i++;
 		}
 	}
 

From 151a31b25e5c941bdd9fdefed650effca223c716 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:56:39 +0800
Subject: [PATCH 04/19] Btrfs: use i_size_read() in btrfs_defrag_file()

Don't use inode->i_size directly, since we're not holding i_mutex.

This also fixes another bug, that i_size can change after it's checked
against 0 and then (i_size - 1) can be negative.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b39f7bf92704..323d77f09258 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -978,6 +978,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	struct btrfs_super_block *disk_super;
 	struct file_ra_state *ra = NULL;
 	unsigned long last_index;
+	u64 isize = i_size_read(inode);
 	u64 features;
 	u64 last_len = 0;
 	u64 skip = 0;
@@ -1003,7 +1004,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			compress_type = range->compress_type;
 	}
 
-	if (inode->i_size == 0)
+	if (isize == 0)
 		return 0;
 
 	/*
@@ -1028,10 +1029,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 	/* find the last page to defrag */
 	if (range->start + range->len > range->start) {
-		last_index = min_t(u64, inode->i_size - 1,
+		last_index = min_t(u64, isize - 1,
 			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
 	} else {
-		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 	}
 
 	if (newer_than) {

From 5ca496604b5975d371bb669ee6c2394bcbea818f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:56:55 +0800
Subject: [PATCH 05/19] Btrfs: fix wrong max_to_defrag in btrfs_defrag_file()

It's off-by-one, and thus we may skip the last page while defragmenting.

An example case:

  # create /mnt/file with 2 4K file extents
  # btrfs fi defrag /mnt/file
  # sync
  # filefrag /mnt/file
  /mnt/file: 2 extents found

So it's not defragmented.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 323d77f09258..f9026413bcf1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1052,7 +1052,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index - 1;
+		max_to_defrag = last_index;
 
 	while (i <= last_index && defrag_count < max_to_defrag) {
 		/*

From 83c8c9bde0add721f7509aa446455183b040b931 Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Wed, 14 Sep 2011 14:11:21 +0800
Subject: [PATCH 06/19] btrfs: trivial fix, a potential memory leak in
 btrfs_parse_early_options()

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
---
 fs/btrfs/super.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 266d1f35465d..09ce951666ea 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -419,7 +419,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
-	char *opts, *orig, *p;
+	char *device_name, *opts, *orig, *p;
 	int error = 0;
 	int intarg;
 
@@ -470,8 +470,14 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 			}
 			break;
 		case Opt_device:
-			error = btrfs_scan_one_device(match_strdup(&args[0]),
+			device_name = match_strdup(&args[0]);
+			if (!device_name) {
+				error = -ENOMEM;
+				goto out;
+			}
+			error = btrfs_scan_one_device(device_name,
 					flags, holder, fs_devices);
+			kfree(device_name);
 			if (error)
 				goto out;
 			break;

From 008873eafbc77deb1702aedece33756c58486c6a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:57:07 +0800
Subject: [PATCH 07/19] Btrfs: honor extent thresh during defragmentation

We won't defrag an extent, if it's bigger than the threshold we
specified and there's no small extent before it, but actually
the code doesn't work this way.

There are three bugs:

- When should_defrag_range() decides we should keep on defragmenting
  an extent, last_len is not incremented. (old bug)

- The length that passes to should_defrag_range() is not the length
  we're going to defrag. (new bug)

- We always defrag 256K bytes data, and a big extent can be part of
  this range. (new bug)

For a file with 4 extents:

        | 4K | 4K | 256K | 256K |

The result of defrag with (the default) 256K extent thresh should be:

        | 264K | 256K |

but with those bugs, we'll get:

        | 520K |

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f9026413bcf1..d524b6697ad9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -765,7 +765,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	int ret = 1;
 
 	/*
-	 * make sure that once we start defragging and extent, we keep on
+	 * make sure that once we start defragging an extent, we keep on
 	 * defragging it
 	 */
 	if (start < *defrag_end)
@@ -810,7 +810,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	 * extent will force at least part of that big extent to be defragged.
 	 */
 	if (ret) {
-		*last_len += len;
 		*defrag_end = extent_map_end(em);
 	} else {
 		*last_len = 0;
@@ -984,13 +983,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	u64 skip = 0;
 	u64 defrag_end = 0;
 	u64 newer_off = range->start;
-	int newer_left = 0;
 	unsigned long i;
+	unsigned long ra_index = 0;
 	int ret;
 	int defrag_count = 0;
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int extent_thresh = range->extent_thresh;
-	int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	int cluster = max_cluster;
 	u64 new_align = ~((u64)128 * 1024 - 1);
 	struct page **pages = NULL;
 
@@ -1020,7 +1020,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		ra = &file->f_ra;
 	}
 
-	pages = kmalloc(sizeof(struct page *) * newer_cluster,
+	pages = kmalloc(sizeof(struct page *) * max_cluster,
 			GFP_NOFS);
 	if (!pages) {
 		ret = -ENOMEM;
@@ -1045,7 +1045,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			 * the extents in the file evenly spaced
 			 */
 			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
-			newer_left = newer_cluster;
 		} else
 			goto out_ra;
 	} else {
@@ -1077,12 +1076,26 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			i = max(i + 1, next);
 			continue;
 		}
+
+		if (!newer_than) {
+			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+				   PAGE_CACHE_SHIFT) - i;
+			cluster = min(cluster, max_cluster);
+		} else {
+			cluster = max_cluster;
+		}
+
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = compress_type;
 
-		btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
+		if (i + cluster > ra_index) {
+			ra_index = max(i, ra_index);
+			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+				       cluster);
+			ra_index += max_cluster;
+		}
 
-		ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0)
 			goto out_ra;
 
@@ -1102,15 +1115,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			if (!ret) {
 				range->start = newer_off;
 				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
-				newer_left = newer_cluster;
 			} else {
 				break;
 			}
 		} else {
-			if (ret > 0)
+			if (ret > 0) {
 				i += ret;
-			else
+				last_len += ret << PAGE_CACHE_SHIFT;
+			} else {
 				i++;
+				last_len = 0;
+			}
 		}
 	}
 

From f4c697e6406da5dd445eda8d923c53e1138793dd Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Mon, 5 Sep 2011 16:34:54 +0200
Subject: [PATCH 08/19] btrfs: return EINVAL if start > total_bytes in fitrim
 ioctl

We should retirn EINVAL if the start is beyond the end of the file
system in the btrfs_ioctl_fitrim(). Fix that by adding the appropriate
check for it.

Also in the btrfs_trim_fs() it is possible that len+start might overflow
if big values are passed. Fix it by decrementing the len so that start+len
is equal to the file system size in the worst case.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/btrfs/ioctl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d524b6697ad9..136a2f980e21 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -282,6 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	struct fstrim_range range;
 	u64 minlen = ULLONG_MAX;
 	u64 num_devices = 0;
+	u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -300,12 +301,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 		}
 	}
 	rcu_read_unlock();
+
 	if (!num_devices)
 		return -EOPNOTSUPP;
-
 	if (copy_from_user(&range, arg, sizeof(range)))
 		return -EFAULT;
+	if (range.start > total_bytes)
+		return -EINVAL;
 
+	range.len = min(range.len, total_bytes - range.start);
 	range.minlen = max(range.minlen, minlen);
 	ret = btrfs_trim_fs(root, &range);
 	if (ret < 0)

From a05a9bb18ae0abec0b513b5fde876c47905fa13e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 6 Sep 2011 16:55:34 +0800
Subject: [PATCH 09/19] Btrfs: fix array bound checking

Otherwise we can execced the array bound of path->slots[].

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ctree.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;

From 560f7d75457f86a43970aa413e334e394082dce4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 8 Sep 2011 10:22:01 +0800
Subject: [PATCH 10/19] Btrfs: remove BUG_ON() in compress_file_range()

It's not a big deal if we fail to allocate the array, and instead of
panic we can just give up compressing.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f12747c9447b..81d4f68f35c9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,7 +393,10 @@ again:
 	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
-		BUG_ON(!pages);
+		if (!pages) {
+			/* just bail out to the uncompressed code */
+			goto cont;
+		}
 
 		if (BTRFS_I(inode)->force_compress)
 			compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
 			will_compress = 1;
 		}
 	}
+cont:
 	if (start == 0) {
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));

From f0dd9592a1aa014b3a01aa2be7e795aae040d65b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 8 Sep 2011 10:26:51 +0800
Subject: [PATCH 11/19] Btrfs: fix direct-io vs nodatacow

To reproduce the bug:

  # mount -o nodatacow /dev/sda7 /mnt/
  # dd if=/dev/zero of=/mnt/tmp bs=4K count=1
  1+0 records in
  1+0 records out
  4096 bytes (4.1 kB) copied, 0.000136115 s, 30.1 MB/s
  # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 conv=notrunc oflag=direct
  dd: writing `/mnt/tmp': Input/output error
  1+0 records in
  0+0 records out

btrfs_ordered_update_i_size() may return 1, but btrfs_endio_direct_write()
mistakenly takes it as an error.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81d4f68f35c9..65474d95f26f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5777,8 +5777,7 @@ again:
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
 		if (!ret)
-			ret = btrfs_update_inode(trans, root, inode);
-		err = ret;
+			err = btrfs_update_inode(trans, root, inode);
 		goto out;
 	}
 

From fee187d9d9ddc382c81370a9a280391132dea2e1 Mon Sep 17 00:00:00 2001
From: Liu Bo <liubo2009@cn.fujitsu.com>
Date: Thu, 29 Sep 2011 15:55:28 +0800
Subject: [PATCH 12/19] Btrfs: do not set EXTENT_DIRTY along with
 EXTENT_DELALLOC

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0ada0b700b44..f284d4e5f447 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1107,7 +1107,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE,
 			      0, NULL, cached_state, mask);
 }
 

From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 2 Oct 2011 13:56:53 +0300
Subject: [PATCH 13/19] Btrfs: pass the correct root to
 lookup_free_space_inode()

Free space items are located in tree of tree roots, not in the extent
tree.  It didn't pop up because lookup_free_space_inode() grabs the
inode all the time instead of actually searching the tree.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4eb7d2ba38f8..6cfcc9060c83 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	inode = lookup_free_space_inode(root, block_group, path);
+	inode = lookup_free_space_inode(tree_root, block_group, path);
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
 		BUG_ON(ret);

From cfbffc39ac89dbd5197cbeec2599a1128eb928f8 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 6 Oct 2011 13:37:08 +0900
Subject: [PATCH 14/19] Btrfs: fix return value of btrfs_get_acl()

In btrfs_get_acl(), when the second __btrfs_getxattr() call fails,
acl is not correctly set.
Therefore, a wrong value might return to the caller.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/acl.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT?  I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }

From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 9 Sep 2011 17:34:35 +0800
Subject: [PATCH 15/19] Btrfs: fix race between multi-task space allocation and
 caching space

The task may fail to get free space though it is enough when multi-task
space allocation and caching space happen at the same time.

	Task1			Caching Thread		Task2
	------------------------------------------------------------------------
	find_free_extent
	  The space has not
	  be cached, and start
	  caching thread. And
	  wait for it.
				cache space, if
				the space is > 2MB
				wake up Task1
							find_free_extent
							  get all the space that
							  is cached.
	  try to allocate space,
	  but there is no space
	  now.
	trigger BUG_ON()

The message is following:
btrfs allocation failed flags 1, wanted 4096
space_info has 1040187392 free, is not full
space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0
block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved
block group has cluster?: no
0 blocks of free space at or bigger than bytes is
block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved
block group has cluster?: no
0 blocks of free space at or bigger than bytes is
------------[ cut here ]------------
kernel BUG at fs/btrfs/inode.c:835!
 [<ffffffffa031261b>] __extent_writepage+0x1bf/0x5ce [btrfs]
 [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108
 [<ffffffffa02f8ada>] ? wait_current_trans+0x23/0xec [btrfs]
 [<ffffffff810c3fbf>] ? find_get_pages_tag+0x73/0xe2
 [<ffffffffa0312d12>] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs]
 [<ffffffffa0312e74>] extent_writepages+0x3e/0x53 [btrfs]
 [<ffffffff8110ad2c>] ? do_sync_write+0xc6/0x103
 [<ffffffffa0302d6e>] ? btrfs_submit_direct+0x414/0x414 [btrfs]
 [<ffffffff811380fa>] ? fsnotify+0x236/0x266
 [<ffffffffa02fc930>] btrfs_writepages+0x22/0x24 [btrfs]
 [<ffffffff810cc215>] do_writepages+0x1c/0x25
 [<ffffffff810c4958>] __filemap_fdatawrite_range+0x4e/0x50
 [<ffffffff810c4982>] filemap_write_and_wait_range+0x28/0x51
 [<ffffffffa0306b2e>] btrfs_sync_file+0x7d/0x198 [btrfs]
 [<ffffffff8110aa26>] ? fsnotify_modify+0x5d/0x65
 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21
 [<ffffffff8112d170>] vfs_fsync+0x17/0x19
 [<ffffffff8112d316>] do_fsync+0x29/0x3e
 [<ffffffff8112d348>] sys_fsync+0xb/0xf
 [<ffffffff81468352>] system_call_fastpath+0x16/0x1b
[SNIP]
RIP  [<ffffffffa02fe08c>] cow_file_range+0x1c4/0x32b [btrfs]

We fix this bug by trying to allocate the space again if there are block groups
in caching.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6cfcc9060c83..cef355f1328a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
 	bool use_cluster = true;
+	bool have_caching_bg = false;
 	u64 ideal_cache_percent = 0;
 	u64 ideal_cache_offset = 0;
 
@@ -5036,6 +5037,7 @@ ideal_cache:
 		}
 	}
 search:
+	have_caching_bg = false;
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group, &space_info->block_groups[index],
 			    list) {
@@ -5244,6 +5246,8 @@ refill_cluster:
 			failed_alloc = true;
 			goto have_block_group;
 		} else if (!offset) {
+			if (!cached)
+				have_caching_bg = true;
 			goto loop;
 		}
 checks:
@@ -5294,6 +5298,9 @@ loop:
 	}
 	up_read(&space_info->groups_sem);
 
+	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+		goto search;
+
 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
 		goto search;
 

From 068132bad1de70f85f5f6d12c36d64f8f7848d92 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Thu, 23 Jun 2011 23:01:01 +0800
Subject: [PATCH 16/19] btrfs: fix oops on failure path

If lookup_extent_backref fails, path->nodes[0] reasonably could be
null along with other callers of btrfs_print_leaf, so ensure we have a
valid extent buffer before dereferencing.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
---
 fs/btrfs/print-tree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
-	u32 type;
-	u32 nr = btrfs_header_nritems(l);
+	u32 type, nr;
 	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 
+	if (!l)
+		return;
+
+	nr = btrfs_header_nritems(l);
+
 	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));

From 5f524444c351e145a5f7e28253594688a421bfe8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 13 Oct 2011 00:20:43 +0300
Subject: [PATCH 17/19] Btrfs: fix a bug when opening seed devices

Initialize fs_info->bdev_holder a bit earlier to be able to pass a
correct holder id to blkdev_get() when opening seed devices with O_EXCL.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 09ce951666ea..29eecbb6ec3a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -939,6 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 		s->s_flags = flags | MS_NOSEC;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
@@ -946,7 +947,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 			return ERR_PTR(error);
 		}
 
-		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		s->s_flags |= MS_ACTIVE;
 	}
 

From 20bcd64934e4eb8f3f90a0dca54fb0ac2edd7795 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 20 Oct 2011 00:06:20 +0300
Subject: [PATCH 18/19] Btrfs: close all bdevs on mount failure

Fix a bug introduced by 20b45077.  We have to return EINVAL on mount
failure, but doing that too early in the sequence leaves all of the
devices opened exclusively.  This also fixes an issue where under some
scenarios only a second mount -o degraded <devices> command would
succeed.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e138af710de2..c6938b45e0fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -597,10 +597,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 
 		bh = btrfs_read_dev_super(bdev);
-		if (!bh) {
-			ret = -EINVAL;
+		if (!bh)
 			goto error_close;
-		}
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +653,7 @@ error:
 		continue;
 	}
 	if (fs_devices->open_devices == 0) {
-		ret = -EIO;
+		ret = -EINVAL;
 		goto out;
 	}
 	fs_devices->seeding = seeding;

From f9d9ef62cd3ecbd6cbb7957a253c1e81f69d5586 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 29 Sep 2011 13:11:33 +0200
Subject: [PATCH 19/19] btrfs: do not allow mounting non-subvolumes via subvol
 option

There's a missing test whether the path passed to subvol=path option
during mount is a real subvolume, allowing any directory located in
default subovlume to be passed and accepted for mount.

(current btrfs progs prevent this early)
$ btrfs subvol snapshot . p1-snap
ERROR: '.' is not a subvolume

(with "is subvolume?" test bypassed)
$ btrfs subvol snapshot . p1-snap
Create a snapshot of '.' in './p1-snap'

$ btrfs subvol list -p .
ID 258 parent 5 top level 5 path subvol
ID 259 parent 5 top level 5 path subvol1
ID 260 parent 5 top level 5 path default-subvol1
ID 262 parent 5 top level 5 path p1/p1-snapshot
ID 263 parent 259 top level 5 path subvol1/subvol1-snap

The problem I see is that this makes a false impression of snapshotting the
given subvolume but in fact snapshots the default one: a user expects outcome
like ID 263 but in fact gets ID 262 .

This patch makes mount fail with EINVAL with a message in syslog.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/super.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 29eecbb6ec3a..5429b1fa0bfc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -740,6 +740,16 @@ static int btrfs_set_super(struct super_block *s, void *data)
 	return set_anon_super(s, data);
 }
 
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+	if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
 /*
  * This will strip out the subvol=%s argument for an argument string and add
  * subvolid=0 to make sure we get the actual tree root for path walking to the
@@ -843,6 +853,15 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 	if (error)
 		return ERR_PTR(error);
 
+	if (!is_subvolume_inode(path.dentry->d_inode)) {
+		path_put(&path);
+		mntput(mnt);
+		error = -EINVAL;
+		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
+				subvol_name);
+		return ERR_PTR(-EINVAL);
+	}
+
 	/* Get a ref to the sb and the dentry we found and return it */
 	s = path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);