From 547edce3ba234a5161c6e1f4389f4a0d2f8451aa Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:30:58 +1100
Subject: [PATCH 01/86] ext4: tell DAX the size of allocation holes

When DAX calls _ext4_get_block() and the file offset points to a hole we
currently don't set bh->b_size.  This is current worked around via
buffer_size_valid() in fs/dax.c.

_ext4_get_block() has the hole size information from ext4_map_blocks(), so
populate bh->b_size so we can remove buffer_size_valid() in a later patch.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/ext4/inode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9c064727ed62..3d58b2b477e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -767,6 +767,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 		ext4_update_bh_state(bh, map.m_flags);
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 		ret = 0;
+	} else if (ret == 0) {
+		/* hole case, need to fill in bh->b_size */
+		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 	}
 	return ret;
 }

From fa0d3fce7ceff0aae62f3e19678713bc5a7f3377 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:31:14 +1100
Subject: [PATCH 02/86] dax: remove buffer_size_valid()

Now that ext4 properly sets bh.b_size when we call get_block() for a hole,
rely on that value and remove the buffer_size_valid() sanity check.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 014defd2e744..b09817ac6f87 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -123,19 +123,6 @@ static bool buffer_written(struct buffer_head *bh)
 	return buffer_mapped(bh) && !buffer_unwritten(bh);
 }
 
-/*
- * When ext4 encounters a hole, it returns without modifying the buffer_head
- * which means that we can't trust b_size.  To cope with this, we set b_state
- * to 0 before calling get_block and, if any bit is set, we know we can trust
- * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
- * and would save us time calling get_block repeatedly.
- */
-static bool buffer_size_valid(struct buffer_head *bh)
-{
-	return bh->b_state != 0;
-}
-
-
 static sector_t to_sector(const struct buffer_head *bh,
 		const struct inode *inode)
 {
@@ -177,8 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 				rc = get_block(inode, block, bh, rw == WRITE);
 				if (rc)
 					break;
-				if (!buffer_size_valid(bh))
-					bh->b_size = 1 << blkbits;
 				bh_max = pos - first + bh->b_size;
 				bdev = bh->b_bdev;
 				/*
@@ -1012,12 +997,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
 	bdev = bh.b_bdev;
 
-	/*
-	 * If the filesystem isn't willing to tell us the length of a hole,
-	 * just fall back to PTEs.  Calling get_block 512 times in a loop
-	 * would be silly.
-	 */
-	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+	if (bh.b_size < PMD_SIZE) {
 		dax_pmd_dbg(&bh, address, "allocated block too small");
 		return VM_FAULT_FALLBACK;
 	}

From 03e0990fc88f82c85abeaf90aabe1921e4e0b72f Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:31:33 +1100
Subject: [PATCH 03/86] ext2: remove support for DAX PMD faults

DAX PMD support was added via the following commit:

commit e7b1ea2ad658 ("ext2: huge page fault support")

I believe this path to be untested as ext2 doesn't reliably provide block
allocations that are aligned to 2MiB.  In my testing I've been unable to
get ext2 to actually fault in a PMD.  It always fails with a "pfn
unaligned" message because the sector returned by ext2_get_block() isn't
aligned.

I've tried various settings for the "stride" and "stripe_width" extended
options to mkfs.ext2, without any luck.

Since we can't reliably get PMDs, remove support so that we don't have an
untested code path that we may someday traverse when we happen to get an
aligned block allocation.  This should also make 4k DAX faults in ext2 a
bit faster since they will no longer have to call the PMD fault handler
only to get a response of VM_FAULT_FALLBACK.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/ext2/file.c | 29 ++++++-----------------------
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a0e1478dfd04..fb88b51ca947 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return ret;
 }
 
-static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-						pmd_t *pmd, unsigned int flags)
-{
-	struct inode *inode = file_inode(vma->vm_file);
-	struct ext2_inode_info *ei = EXT2_I(inode);
-	int ret;
-
-	if (flags & FAULT_FLAG_WRITE) {
-		sb_start_pagefault(inode->i_sb);
-		file_update_time(vma->vm_file);
-	}
-	down_read(&ei->dax_sem);
-
-	ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
-
-	up_read(&ei->dax_sem);
-	if (flags & FAULT_FLAG_WRITE)
-		sb_end_pagefault(inode->i_sb);
-	return ret;
-}
-
 static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 		struct vm_fault *vmf)
 {
@@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
 	.fault		= ext2_dax_fault,
-	.pmd_fault	= ext2_dax_pmd_fault,
+	/*
+	 * .pmd_fault is not supported for DAX because allocation in ext2
+	 * cannot be reliably aligned to huge page sizes and so pmd faults
+	 * will always fail and fail back to regular faults.
+	 */
 	.page_mkwrite	= ext2_dax_fault,
 	.pfn_mkwrite	= ext2_dax_pfn_mkwrite,
 };
@@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	file_accessed(file);
 	vma->vm_ops = &ext2_dax_vm_ops;
-	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	vma->vm_flags |= VM_MIXEDMAP;
 	return 0;
 }
 #else

From ce95ab0fa669698805ff8841a756733a7702b5c9 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:31:44 +1100
Subject: [PATCH 04/86] dax: make 'wait_table' global variable static

The global 'wait_table' variable is only used within fs/dax.c, and
generates the following sparse warning:

fs/dax.c:39:19: warning: symbol 'wait_table' was not declared. Should it be static?

Make it static so it has scope local to fs/dax.c, and to make sparse happy.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index b09817ac6f87..e52e7543c351 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -52,7 +52,7 @@
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
 
-wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
 static int __init init_dax_wait_table(void)
 {

From aada54f9800497cc8249c99ffcb1431c6d8e98e7 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:32:00 +1100
Subject: [PATCH 05/86] dax: remove the last BUG_ON() from fs/dax.c

Don't take down the kernel if we get an invalid 'from' and 'length'
argument pair.  Just warn once and return an error.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index e52e7543c351..219fa2bd676a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1194,7 +1194,8 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	/* Block boundary? Nothing to do */
 	if (!length)
 		return 0;
-	BUG_ON((offset + length) > PAGE_SIZE);
+	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
+		return -EINVAL;
 
 	memset(&bh, 0, sizeof(bh));
 	bh.b_bdev = inode->i_sb->s_bdev;

From e3ad61c64abceeb5cc122f0bde3700e6e17ca60a Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:32:12 +1100
Subject: [PATCH 06/86] dax: consistent variable naming for DAX entries

No functional change.

Consistently use the variable name 'entry' instead of 'ret' for DAX radix
tree entries.  This was already happening in most of the code, so update
get_unlocked_mapping_entry(), grab_mapping_entry() and
dax_unlock_mapping_entry().

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 219fa2bd676a..835e7f082cff 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -357,7 +357,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
 static void *get_unlocked_mapping_entry(struct address_space *mapping,
 					pgoff_t index, void ***slotp)
 {
-	void *ret, **slot;
+	void *entry, **slot;
 	struct wait_exceptional_entry_queue ewait;
 	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
 
@@ -367,13 +367,13 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 	ewait.key.index = index;
 
 	for (;;) {
-		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
 					  &slot);
-		if (!ret || !radix_tree_exceptional_entry(ret) ||
+		if (!entry || !radix_tree_exceptional_entry(entry) ||
 		    !slot_locked(mapping, slot)) {
 			if (slotp)
 				*slotp = slot;
-			return ret;
+			return entry;
 		}
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
@@ -396,13 +396,13 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
  */
 static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *ret, **slot;
+	void *entry, **slot;
 
 restart:
 	spin_lock_irq(&mapping->tree_lock);
-	ret = get_unlocked_mapping_entry(mapping, index, &slot);
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!ret) {
+	if (!entry) {
 		int err;
 
 		spin_unlock_irq(&mapping->tree_lock);
@@ -410,10 +410,10 @@ restart:
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
 		if (err)
 			return ERR_PTR(err);
-		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
 			       RADIX_DAX_ENTRY_LOCK);
 		spin_lock_irq(&mapping->tree_lock);
-		err = radix_tree_insert(&mapping->page_tree, index, ret);
+		err = radix_tree_insert(&mapping->page_tree, index, entry);
 		radix_tree_preload_end();
 		if (err) {
 			spin_unlock_irq(&mapping->tree_lock);
@@ -425,11 +425,11 @@ restart:
 		/* Good, we have inserted empty locked entry into the tree. */
 		mapping->nrexceptional++;
 		spin_unlock_irq(&mapping->tree_lock);
-		return ret;
+		return entry;
 	}
 	/* Normal page in radix tree? */
-	if (!radix_tree_exceptional_entry(ret)) {
-		struct page *page = ret;
+	if (!radix_tree_exceptional_entry(entry)) {
+		struct page *page = entry;
 
 		get_page(page);
 		spin_unlock_irq(&mapping->tree_lock);
@@ -442,9 +442,9 @@ restart:
 		}
 		return page;
 	}
-	ret = lock_slot(mapping, slot);
+	entry = lock_slot(mapping, slot);
 	spin_unlock_irq(&mapping->tree_lock);
-	return ret;
+	return entry;
 }
 
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
@@ -469,11 +469,11 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 
 void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *ret, **slot;
+	void *entry, **slot;
 
 	spin_lock_irq(&mapping->tree_lock);
-	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
-	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
 			 !slot_locked(mapping, slot))) {
 		spin_unlock_irq(&mapping->tree_lock);
 		return;

From 63e95b5c4f16e156b98adcf2f7d820ba941c82a3 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:32:20 +1100
Subject: [PATCH 07/86] dax: coordinate locking for offsets in PMD range

DAX radix tree locking currently locks entries based on the unique
combination of the 'mapping' pointer and the pgoff_t 'index' for the entry.
This works for PTEs, but as we move to PMDs we will need to have all the
offsets within the range covered by the PMD to map to the same bit lock.
To accomplish this, for ranges covered by a PMD entry we will instead lock
based on the page offset of the beginning of the PMD entry.  The 'mapping'
pointer is still used in the same way.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c            | 65 ++++++++++++++++++++++++++++-----------------
 include/linux/dax.h |  2 +-
 mm/filemap.c        |  2 +-
 3 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 835e7f082cff..72387023545e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -64,14 +64,6 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
-					      pgoff_t index)
-{
-	unsigned long hash = hash_long((unsigned long)mapping ^ index,
-				       DAX_WAIT_TABLE_BITS);
-	return wait_table + hash;
-}
-
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
@@ -285,7 +277,7 @@ EXPORT_SYMBOL_GPL(dax_do_io);
  */
 struct exceptional_entry_key {
 	struct address_space *mapping;
-	unsigned long index;
+	pgoff_t entry_start;
 };
 
 struct wait_exceptional_entry_queue {
@@ -293,6 +285,26 @@ struct wait_exceptional_entry_queue {
 	struct exceptional_entry_key key;
 };
 
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+		pgoff_t index, void *entry, struct exceptional_entry_key *key)
+{
+	unsigned long hash;
+
+	/*
+	 * If 'entry' is a PMD, align the 'index' that we use for the wait
+	 * queue to the start of that PMD.  This ensures that all offsets in
+	 * the range covered by the PMD map to the same bit lock.
+	 */
+	if (RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+
+	key->mapping = mapping;
+	key->entry_start = index;
+
+	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+	return wait_table + hash;
+}
+
 static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
 				       int sync, void *keyp)
 {
@@ -301,7 +313,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
 		container_of(wait, struct wait_exceptional_entry_queue, wait);
 
 	if (key->mapping != ewait->key.mapping ||
-	    key->index != ewait->key.index)
+	    key->entry_start != ewait->key.entry_start)
 		return 0;
 	return autoremove_wake_function(wait, mode, sync, NULL);
 }
@@ -359,12 +371,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 {
 	void *entry, **slot;
 	struct wait_exceptional_entry_queue ewait;
-	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+	wait_queue_head_t *wq;
 
 	init_wait(&ewait.wait);
 	ewait.wait.func = wake_exceptional_entry_func;
-	ewait.key.mapping = mapping;
-	ewait.key.index = index;
 
 	for (;;) {
 		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
@@ -375,6 +385,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 				*slotp = slot;
 			return entry;
 		}
+
+		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(&mapping->tree_lock);
@@ -447,10 +459,20 @@ restart:
 	return entry;
 }
 
+/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree.  This is okay, though, because all we really need to do is to
+ * find the correct waitqueue where tasks might be sleeping waiting for that
+ * old 'entry' and wake them.
+ */
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-				   pgoff_t index, bool wake_all)
+		pgoff_t index, void *entry, bool wake_all)
 {
-	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+	struct exceptional_entry_key key;
+	wait_queue_head_t *wq;
+
+	wq = dax_entry_waitqueue(mapping, index, entry, &key);
 
 	/*
 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -458,13 +480,8 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	 * So at this point all tasks that could have seen our entry locked
 	 * must be in the waitqueue and the following check will see them.
 	 */
-	if (waitqueue_active(wq)) {
-		struct exceptional_entry_key key;
-
-		key.mapping = mapping;
-		key.index = index;
+	if (waitqueue_active(wq))
 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-	}
 }
 
 void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
@@ -480,7 +497,7 @@ void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 	}
 	unlock_slot(mapping, slot);
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, false);
+	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
 static void put_locked_mapping_entry(struct address_space *mapping,
@@ -505,7 +522,7 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 		return;
 
 	/* We have to wake up next waiter for the radix tree entry lock */
-	dax_wake_mapping_entry_waiter(mapping, index, false);
+	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
 /*
@@ -532,7 +549,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 	radix_tree_delete(&mapping->page_tree, index);
 	mapping->nrexceptional--;
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, true);
+	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
 
 	return 1;
 }
diff --git a/include/linux/dax.h b/include/linux/dax.h
index add6c4bc568f..a41a747d6112 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -22,7 +22,7 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-				   pgoff_t index, bool wake_all);
+		pgoff_t index, void *entry, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..1ffb7dcd1b5d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -143,7 +143,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
 			if (node)
 				workingset_node_pages_dec(node);
 			/* Wakeup waiters for exceptional entry lock */
-			dax_wake_mapping_entry_waiter(mapping, page->index,
+			dax_wake_mapping_entry_waiter(mapping, page->index, p,
 						      false);
 		}
 	}

From b9fde0462e34a05b25c3d68d344971865659abae Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:32:35 +1100
Subject: [PATCH 08/86] dax: remove dax_pmd_fault()

dax_pmd_fault() is the old struct buffer_head + get_block_t based 2 MiB DAX
fault handler.  This fault handler has been disabled for several kernel
releases, and support for PMDs will be reintroduced using the struct iomap
interface instead.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c            | 213 --------------------------------------------
 include/linux/dax.h |   6 +-
 2 files changed, 1 insertion(+), 218 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 72387023545e..3d0b1032c555 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -915,219 +915,6 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
-
-static void __dax_dbg(struct buffer_head *bh, unsigned long address,
-		const char *reason, const char *fn)
-{
-	if (bh) {
-		char bname[BDEVNAME_SIZE];
-		bdevname(bh->b_bdev, bname);
-		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
-			"length %zd fallback: %s\n", fn, current->comm,
-			address, bname, bh->b_state, (u64)bh->b_blocknr,
-			bh->b_size, reason);
-	} else {
-		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
-			current->comm, address, reason);
-	}
-}
-
-#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
-
-/**
- * dax_pmd_fault - handle a PMD fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * pmd_fault handler for DAX files.
- */
-int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, get_block_t get_block)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	struct buffer_head bh;
-	unsigned blkbits = inode->i_blkbits;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = flags & FAULT_FLAG_WRITE;
-	struct block_device *bdev;
-	pgoff_t size, pgoff;
-	sector_t block;
-	int result = 0;
-	bool alloc = false;
-
-	/* dax pmd mappings require pfn_t_devmap() */
-	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
-		return VM_FAULT_FALLBACK;
-
-	/* Fall back to PTEs if we're going to COW */
-	if (write && !(vma->vm_flags & VM_SHARED)) {
-		split_huge_pmd(vma, pmd, address);
-		dax_pmd_dbg(NULL, address, "cow write");
-		return VM_FAULT_FALLBACK;
-	}
-	/* If the PMD would extend outside the VMA */
-	if (pmd_addr < vma->vm_start) {
-		dax_pmd_dbg(NULL, address, "vma start unaligned");
-		return VM_FAULT_FALLBACK;
-	}
-	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
-		dax_pmd_dbg(NULL, address, "vma end unaligned");
-		return VM_FAULT_FALLBACK;
-	}
-
-	pgoff = linear_page_index(vma, pmd_addr);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (pgoff >= size)
-		return VM_FAULT_SIGBUS;
-	/* If the PMD would cover blocks out of the file */
-	if ((pgoff | PG_PMD_COLOUR) >= size) {
-		dax_pmd_dbg(NULL, address,
-				"offset + huge page size > file size");
-		return VM_FAULT_FALLBACK;
-	}
-
-	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
-
-	bh.b_size = PMD_SIZE;
-
-	if (get_block(inode, block, &bh, 0) != 0)
-		return VM_FAULT_SIGBUS;
-
-	if (!buffer_mapped(&bh) && write) {
-		if (get_block(inode, block, &bh, 1) != 0)
-			return VM_FAULT_SIGBUS;
-		alloc = true;
-		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	}
-
-	bdev = bh.b_bdev;
-
-	if (bh.b_size < PMD_SIZE) {
-		dax_pmd_dbg(&bh, address, "allocated block too small");
-		return VM_FAULT_FALLBACK;
-	}
-
-	/*
-	 * If we allocated new storage, make sure no process has any
-	 * zero pages covering this hole
-	 */
-	if (alloc) {
-		loff_t lstart = pgoff << PAGE_SHIFT;
-		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
-
-		truncate_pagecache_range(inode, lstart, lend);
-	}
-
-	if (!write && !buffer_mapped(&bh)) {
-		spinlock_t *ptl;
-		pmd_t entry;
-		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
-
-		if (unlikely(!zero_page)) {
-			dax_pmd_dbg(&bh, address, "no zero page");
-			goto fallback;
-		}
-
-		ptl = pmd_lock(vma->vm_mm, pmd);
-		if (!pmd_none(*pmd)) {
-			spin_unlock(ptl);
-			dax_pmd_dbg(&bh, address, "pmd already present");
-			goto fallback;
-		}
-
-		dev_dbg(part_to_dev(bdev->bd_part),
-				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
-				__func__, current->comm, address,
-				(unsigned long long) to_sector(&bh, inode));
-
-		entry = mk_pmd(zero_page, vma->vm_page_prot);
-		entry = pmd_mkhuge(entry);
-		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
-		result = VM_FAULT_NOPAGE;
-		spin_unlock(ptl);
-	} else {
-		struct blk_dax_ctl dax = {
-			.sector = to_sector(&bh, inode),
-			.size = PMD_SIZE,
-		};
-		long length = dax_map_atomic(bdev, &dax);
-
-		if (length < 0) {
-			dax_pmd_dbg(&bh, address, "dax-error fallback");
-			goto fallback;
-		}
-		if (length < PMD_SIZE) {
-			dax_pmd_dbg(&bh, address, "dax-length too small");
-			dax_unmap_atomic(bdev, &dax);
-			goto fallback;
-		}
-		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
-			dax_pmd_dbg(&bh, address, "pfn unaligned");
-			dax_unmap_atomic(bdev, &dax);
-			goto fallback;
-		}
-
-		if (!pfn_t_devmap(dax.pfn)) {
-			dax_unmap_atomic(bdev, &dax);
-			dax_pmd_dbg(&bh, address, "pfn not in memmap");
-			goto fallback;
-		}
-		dax_unmap_atomic(bdev, &dax);
-
-		/*
-		 * For PTE faults we insert a radix tree entry for reads, and
-		 * leave it clean.  Then on the first write we dirty the radix
-		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
-		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
-		 * call into get_block() to translate the pgoff to a sector in
-		 * order to be able to create a new radix tree entry.
-		 *
-		 * The PMD path doesn't have an equivalent to
-		 * dax_pfn_mkwrite(), though, so for a read followed by a
-		 * write we traverse all the way through dax_pmd_fault()
-		 * twice.  This means we can just skip inserting a radix tree
-		 * entry completely on the initial read and just wait until
-		 * the write to insert a dirty entry.
-		 */
-		if (write) {
-			/*
-			 * We should insert radix-tree entry and dirty it here.
-			 * For now this is broken...
-			 */
-		}
-
-		dev_dbg(part_to_dev(bdev->bd_part),
-				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
-				__func__, current->comm, address,
-				pfn_t_to_pfn(dax.pfn),
-				(unsigned long long) dax.sector);
-		result |= vmf_insert_pfn_pmd(vma, address, pmd,
-				dax.pfn, write);
-	}
-
- out:
-	return result;
-
- fallback:
-	count_vm_event(THP_FAULT_FALLBACK);
-	result = VM_FAULT_FALLBACK;
-	goto out;
-}
-EXPORT_SYMBOL_GPL(dax_pmd_fault);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a41a747d6112..0f74866edae6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -48,16 +48,12 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
-int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-				unsigned int flags, get_block_t);
-#else
 static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t *pmd, unsigned int flags, get_block_t gb)
 {
 	return VM_FAULT_FALLBACK;
 }
-#endif
+
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 

From 11c59c92f44d9272db7655a462608658a6d95013 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:32:46 +1100
Subject: [PATCH 09/86] dax: correct dax iomap code namespace

The recently added DAX functions that use the new struct iomap data
structure were named iomap_dax_rw(), iomap_dax_fault() and
iomap_dax_actor().  These are actually defined in fs/dax.c, though, so
should be part of the "dax" namespace and not the "iomap" namespace.
Rename them to dax_iomap_rw(), dax_iomap_fault() and dax_iomap_actor()
respectively.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c            | 16 ++++++++--------
 fs/ext2/file.c      |  6 +++---
 fs/xfs/xfs_file.c   |  8 ++++----
 include/linux/dax.h |  4 ++--
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 3d0b1032c555..fdbd7a1ec6cf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1031,7 +1031,7 @@ EXPORT_SYMBOL_GPL(dax_truncate_page);
 
 #ifdef CONFIG_FS_IOMAP
 static loff_t
-iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
 {
 	struct iov_iter *iter = data;
@@ -1088,7 +1088,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 }
 
 /**
- * iomap_dax_rw - Perform I/O to a DAX file
+ * dax_iomap_rw - Perform I/O to a DAX file
  * @iocb:	The control block for this I/O
  * @iter:	The addresses to do I/O from or to
  * @ops:	iomap ops passed from the file system
@@ -1098,7 +1098,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  * and evicting any page cache pages in the region under I/O.
  */
 ssize_t
-iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
@@ -1128,7 +1128,7 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
 
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
-				iter, iomap_dax_actor);
+				iter, dax_iomap_actor);
 		if (ret <= 0)
 			break;
 		pos += ret;
@@ -1138,10 +1138,10 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
 	iocb->ki_pos += done;
 	return done ? done : ret;
 }
-EXPORT_SYMBOL_GPL(iomap_dax_rw);
+EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
 /**
- * iomap_dax_fault - handle a page fault on a DAX file
+ * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  * @ops: iomap ops passed from the file system
@@ -1150,7 +1150,7 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw);
  * or mkwrite handler for DAX files. Assumes the caller has done all the
  * necessary locking for the page fault to proceed successfully.
  */
-int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -1252,5 +1252,5 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
 }
-EXPORT_SYMBOL_GPL(iomap_dax_fault);
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
 #endif /* CONFIG_FS_IOMAP */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index fb88b51ca947..b0f241528a30 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return 0; /* skip atime */
 
 	inode_lock_shared(inode);
-	ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+	ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
 	inode_unlock_shared(inode);
 
 	file_accessed(iocb->ki_filp);
@@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out_unlock;
 
-	ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+	ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		mark_inode_dirty(inode);
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a314fc7b56fa..e7f35d548cfc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -344,7 +344,7 @@ xfs_file_dax_read(
 		return 0; /* skip atime */
 
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
+	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
@@ -691,7 +691,7 @@ xfs_file_dax_write(
 
 	trace_xfs_file_dax_write(ip, count, pos);
 
-	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
 	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
 		i_size_write(inode, iocb->ki_pos);
 		error = xfs_setfilesize(ip, pos, ret);
@@ -1640,7 +1640,7 @@ xfs_filemap_page_mkwrite(
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
@@ -1674,7 +1674,7 @@ xfs_filemap_fault(
 		 * changes to xfs_get_blocks_direct() to map unwritten extent
 		 * ioend for conversion on read-only mappings.
 		 */
-		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
 	} else
 		ret = filemap_fault(vma, vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0f74866edae6..a3dfee4cb03f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -11,13 +11,13 @@ struct iomap_ops;
 /* We use lowest available exceptional entry bit for locking */
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 
-ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			struct iomap_ops *ops);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);

From 333ccc978e1e09af2690e459b6f97d8e91cc01fa Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:33:09 +1100
Subject: [PATCH 10/86] dax: add dax_iomap_sector() helper function

To be able to correctly calculate the sector from a file position and a
struct iomap there is a complex little bit of logic that currently happens
in both dax_iomap_actor() and dax_iomap_fault().  This will need to be
repeated yet again in the DAX PMD fault handler when it is added, so break
it out into a helper function.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index fdbd7a1ec6cf..77379546433e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1030,6 +1030,11 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 EXPORT_SYMBOL_GPL(dax_truncate_page);
 
 #ifdef CONFIG_FS_IOMAP
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
+}
+
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1055,8 +1060,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct blk_dax_ctl dax = { 0 };
 		ssize_t map_len;
 
-		dax.sector = iomap->blkno +
-			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.sector = dax_iomap_sector(iomap, pos);
 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
 		map_len = dax_map_atomic(iomap->bdev, &dax);
 		if (map_len < 0) {
@@ -1193,7 +1197,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		goto unlock_entry;
 	}
 
-	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+	sector = dax_iomap_sector(&iomap, pos);
 
 	if (vmf->cow_page) {
 		switch (iomap.type) {

From 1550290b08012637e8d741a6a298ec6320dadda2 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:33:26 +1100
Subject: [PATCH 11/86] dax: dax_iomap_fault() needs to call iomap_end()

Currently iomap_end() doesn't do anything for DAX page faults for both ext2
and XFS.  ext2_iomap_end() just checks for a write underrun, and
xfs_file_iomap_end() checks to see if it needs to finish a delayed
allocation.  However, in the future iomap_end() calls might be needed to
make sure we have balanced allocations, locks, etc.  So, add calls to
iomap_end() with appropriate error handling to dax_iomap_fault().

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 77379546433e..6edd89b3b69c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1165,6 +1165,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct iomap iomap = { 0 };
 	unsigned flags = 0;
 	int error, major = 0;
+	int locked_status = 0;
 	void *entry;
 
 	/*
@@ -1194,7 +1195,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		goto unlock_entry;
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
 		error = -EIO;		/* fs corruption? */
-		goto unlock_entry;
+		goto finish_iomap;
 	}
 
 	sector = dax_iomap_sector(&iomap, pos);
@@ -1216,13 +1217,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto unlock_entry;
+			goto finish_iomap;
 		if (!radix_tree_exceptional_entry(entry)) {
 			vmf->page = entry;
-			return VM_FAULT_LOCKED;
+			locked_status = VM_FAULT_LOCKED;
+		} else {
+			vmf->entry = entry;
+			locked_status = VM_FAULT_DAX_LOCKED;
 		}
-		vmf->entry = entry;
-		return VM_FAULT_DAX_LOCKED;
+		goto finish_iomap;
 	}
 
 	switch (iomap.type) {
@@ -1237,8 +1240,10 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
-		if (!(vmf->flags & FAULT_FLAG_WRITE))
-			return dax_load_hole(mapping, entry, vmf);
+		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+			locked_status = dax_load_hole(mapping, entry, vmf);
+			break;
+		}
 		/*FALLTHRU*/
 	default:
 		WARN_ON_ONCE(1);
@@ -1246,14 +1251,30 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+ finish_iomap:
+	if (ops->iomap_end) {
+		if (error) {
+			/* keep previous error */
+			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
+					&iomap);
+		} else {
+			error = ops->iomap_end(inode, pos, PAGE_SIZE,
+					PAGE_SIZE, flags, &iomap);
+		}
+	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+	if (!locked_status || error)
+		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
 	/* -EBUSY is fine, somebody else faulted on the same PTE */
 	if (error < 0 && error != -EBUSY)
 		return VM_FAULT_SIGBUS | major;
+	if (locked_status) {
+		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
+		return locked_status;
+	}
 	return VM_FAULT_NOPAGE | major;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);

From fa28f7296a7ce38ed15dc06bd2149e04c8db9d4b Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:33:35 +1100
Subject: [PATCH 12/86] dax: move RADIX_DAX_* defines to dax.h

The RADIX_DAX_* defines currently mostly live in fs/dax.c, with just
RADIX_DAX_ENTRY_LOCK being in include/linux/dax.h so it can be used in
mm/filemap.c.  When we add PMD support, though, mm/filemap.c will also need
access to the RADIX_DAX_PTE type so it can properly construct a 4k sized
empty entry.

Instead of shifting the defines between dax.c and dax.h as they are
individually used in other code, just move them wholesale to dax.h so
they'll be available when we need them.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c            | 14 --------------
 include/linux/dax.h | 15 ++++++++++++++-
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 6edd89b3b69c..c45cc4d8e996 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -34,20 +34,6 @@
 #include <linux/iomap.h>
 #include "internal.h"
 
-/*
- * We use lowest available bit in exceptional entry for locking, other two
- * bits to determine entry type. In total 3 special bits.
- */
-#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
-#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
-		RADIX_TREE_EXCEPTIONAL_ENTRY))
-
 /* We choose 4096 entries - same as per-zone page wait tables */
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a3dfee4cb03f..e9ea78c1cf98 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -8,8 +8,21 @@
 
 struct iomap_ops;
 
-/* We use lowest available exceptional entry bit for locking */
+/*
+ * We use lowest available bit in exceptional entry for locking, other two
+ * bits to determine entry type. In total 3 special bits.
+ */
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+		RADIX_TREE_EXCEPTIONAL_ENTRY))
+
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);

From 422476c4641ec65906406f3d266b69a91dd3170c Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:33:44 +1100
Subject: [PATCH 13/86] dax: move put_(un)locked_mapping_entry() in dax.c

No functional change.

The static functions put_locked_mapping_entry() and
put_unlocked_mapping_entry() will soon be used in error cases in
grab_mapping_entry(), so move their definitions above this function.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index c45cc4d8e996..0582c7c2ae40 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -382,6 +382,31 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
 	}
 }
 
+static void put_locked_mapping_entry(struct address_space *mapping,
+				     pgoff_t index, void *entry)
+{
+	if (!radix_tree_exceptional_entry(entry)) {
+		unlock_page(entry);
+		put_page(entry);
+	} else {
+		dax_unlock_mapping_entry(mapping, index);
+	}
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+				       pgoff_t index, void *entry)
+{
+	if (!radix_tree_exceptional_entry(entry))
+		return;
+
+	/* We have to wake up next waiter for the radix tree entry lock */
+	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
 /*
  * Find radix tree entry at given index. If it points to a page, return with
  * the page locked. If it points to the exceptional entry, return with the
@@ -486,31 +511,6 @@ void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
-static void put_locked_mapping_entry(struct address_space *mapping,
-				     pgoff_t index, void *entry)
-{
-	if (!radix_tree_exceptional_entry(entry)) {
-		unlock_page(entry);
-		put_page(entry);
-	} else {
-		dax_unlock_mapping_entry(mapping, index);
-	}
-}
-
-/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
- */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
-				       pgoff_t index, void *entry)
-{
-	if (!radix_tree_exceptional_entry(entry))
-		return;
-
-	/* We have to wake up next waiter for the radix tree entry lock */
-	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
-}
-
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.

From 642261ac995e01d7837db1f4b90181496f7e6835 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:34:45 +1100
Subject: [PATCH 14/86] dax: add struct iomap based DAX PMD support

DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking.  This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.

There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries.  The empty entries exist to provide locking for the duration of a
given page fault.

This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.

Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set.  This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees.  This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.

One difficult use case is when one thread is trying to use 4k entries in
radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset.  The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases.  This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.

If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry.  We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c            | 378 +++++++++++++++++++++++++++++++++++++++-----
 include/linux/dax.h |  55 +++++--
 mm/filemap.c        |   3 +-
 3 files changed, 386 insertions(+), 50 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 0582c7c2ae40..281e91a63367 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -76,6 +76,26 @@ static void dax_unmap_atomic(struct block_device *bdev,
 	blk_queue_exit(bdev->bd_queue);
 }
 
+static int dax_is_pmd_entry(void *entry)
+{
+	return (unsigned long)entry & RADIX_DAX_PMD;
+}
+
+static int dax_is_pte_entry(void *entry)
+{
+	return !((unsigned long)entry & RADIX_DAX_PMD);
+}
+
+static int dax_is_zero_entry(void *entry)
+{
+	return (unsigned long)entry & RADIX_DAX_HZP;
+}
+
+static int dax_is_empty_entry(void *entry)
+{
+	return (unsigned long)entry & RADIX_DAX_EMPTY;
+}
+
 struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 {
 	struct page *page = alloc_pages(GFP_KERNEL, 0);
@@ -281,7 +301,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
 	 * queue to the start of that PMD.  This ensures that all offsets in
 	 * the range covered by the PMD map to the same bit lock.
 	 */
-	if (RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+	if (dax_is_pmd_entry(entry))
 		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
 
 	key->mapping = mapping;
@@ -413,36 +433,116 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
  * radix tree entry locked. If the radix tree doesn't contain given index,
  * create empty exceptional entry for the index and return with it locked.
  *
+ * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return an error.  This error will
+ * happen if there are any 4k entries (either zero pages or DAX entries)
+ * within the 2MiB range that we are requesting.
+ *
+ * We always favor 4k entries over 2MiB entries. There isn't a flow where we
+ * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
+ * insertion will fail if it finds any 4k entries already in the tree, and a
+ * 4k insertion will cause an existing 2MiB entry to be unmapped and
+ * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
+ * well as 2MiB empty entries.
+ *
+ * The exception to this downgrade path is for 2MiB DAX PMD entries that have
+ * real storage backing them.  We will leave these real 2MiB DAX entries in
+ * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ *
  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  * persistent memory the benefit is doubtful. We can add that later if we can
  * show it helps.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
+		unsigned long size_flag)
 {
+	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
 	void *entry, **slot;
 
 restart:
 	spin_lock_irq(&mapping->tree_lock);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+
+	if (entry) {
+		if (size_flag & RADIX_DAX_PMD) {
+			if (!radix_tree_exceptional_entry(entry) ||
+			    dax_is_pte_entry(entry)) {
+				put_unlocked_mapping_entry(mapping, index,
+						entry);
+				entry = ERR_PTR(-EEXIST);
+				goto out_unlock;
+			}
+		} else { /* trying to grab a PTE entry */
+			if (radix_tree_exceptional_entry(entry) &&
+			    dax_is_pmd_entry(entry) &&
+			    (dax_is_zero_entry(entry) ||
+			     dax_is_empty_entry(entry))) {
+				pmd_downgrade = true;
+			}
+		}
+	}
+
 	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry) {
+	if (!entry || pmd_downgrade) {
 		int err;
 
+		if (pmd_downgrade) {
+			/*
+			 * Make sure 'entry' remains valid while we drop
+			 * mapping->tree_lock.
+			 */
+			entry = lock_slot(mapping, slot);
+		}
+
 		spin_unlock_irq(&mapping->tree_lock);
 		err = radix_tree_preload(
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err)
+		if (err) {
+			if (pmd_downgrade)
+				put_locked_mapping_entry(mapping, index, entry);
 			return ERR_PTR(err);
-		entry = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-			       RADIX_DAX_ENTRY_LOCK);
+		}
+
+		/*
+		 * Besides huge zero pages the only other thing that gets
+		 * downgraded are empty entries which don't need to be
+		 * unmapped.
+		 */
+		if (pmd_downgrade && dax_is_zero_entry(entry))
+			unmap_mapping_range(mapping,
+				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+
 		spin_lock_irq(&mapping->tree_lock);
-		err = radix_tree_insert(&mapping->page_tree, index, entry);
+
+		if (pmd_downgrade) {
+			radix_tree_delete(&mapping->page_tree, index);
+			mapping->nrexceptional--;
+			dax_wake_mapping_entry_waiter(mapping, index, entry,
+					true);
+		}
+
+		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+
+		err = __radix_tree_insert(&mapping->page_tree, index,
+				dax_radix_order(entry), entry);
 		radix_tree_preload_end();
 		if (err) {
 			spin_unlock_irq(&mapping->tree_lock);
-			/* Someone already created the entry? */
-			if (err == -EEXIST)
+			/*
+			 * Someone already created the entry?  This is a
+			 * normal failure when inserting PMDs in a range
+			 * that already contains PTEs.  In that case we want
+			 * to return -EEXIST immediately.
+			 */
+			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
 				goto restart;
+			/*
+			 * Our insertion of a DAX PMD entry failed, most
+			 * likely because it collided with a PTE sized entry
+			 * at a different index in the PMD range.  We haven't
+			 * inserted anything into the radix tree and have no
+			 * waiters to wake.
+			 */
 			return ERR_PTR(err);
 		}
 		/* Good, we have inserted empty locked entry into the tree. */
@@ -466,6 +566,7 @@ restart:
 		return page;
 	}
 	entry = lock_slot(mapping, slot);
+ out_unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 	return entry;
 }
@@ -473,9 +574,9 @@ restart:
 /*
  * We do not necessarily hold the mapping->tree_lock when we call this
  * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree.  This is okay, though, because all we really need to do is to
- * find the correct waitqueue where tasks might be sleeping waiting for that
- * old 'entry' and wake them.
+ * radix tree.  This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
  */
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all)
@@ -588,11 +689,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
 	return 0;
 }
 
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-
+/*
+ * By this point grab_mapping_entry() has ensured that we have a locked entry
+ * of the appropriate size so we don't have to worry about downgrading PMDs to
+ * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ */
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
-				      void *entry, sector_t sector)
+				      void *entry, sector_t sector,
+				      unsigned long flags)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	int error = 0;
@@ -615,22 +722,35 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
 		if (error)
 			return ERR_PTR(error);
+	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
+		/* replacing huge zero page with PMD block mapping */
+		unmap_mapping_range(mapping,
+			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
-	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
-		       RADIX_DAX_ENTRY_LOCK);
+	new_entry = dax_radix_locked_entry(sector, flags);
+
 	if (hole_fill) {
 		__delete_from_page_cache(entry, NULL);
 		/* Drop pagecache reference */
 		put_page(entry);
-		error = radix_tree_insert(page_tree, index, new_entry);
+		error = __radix_tree_insert(page_tree, index,
+				dax_radix_order(new_entry), new_entry);
 		if (error) {
 			new_entry = ERR_PTR(error);
 			goto unlock;
 		}
 		mapping->nrexceptional++;
-	} else {
+	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+		/*
+		 * Only swap our new entry into the radix tree if the current
+		 * entry is a zero page or an empty entry.  If a normal PTE or
+		 * PMD entry is already in the tree, we leave it alone.  This
+		 * means that if we are trying to insert a PTE and the
+		 * existing entry is a PMD, we will just leave the PMD in the
+		 * tree and dirty it if necessary.
+		 */
 		void **slot;
 		void *ret;
 
@@ -660,7 +780,6 @@ static int dax_writeback_one(struct block_device *bdev,
 		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	int type = RADIX_DAX_TYPE(entry);
 	struct radix_tree_node *node;
 	struct blk_dax_ctl dax;
 	void **slot;
@@ -681,13 +800,21 @@ static int dax_writeback_one(struct block_device *bdev,
 	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
 		goto unlock;
 
-	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+				dax_is_zero_entry(entry))) {
 		ret = -EIO;
 		goto unlock;
 	}
 
-	dax.sector = RADIX_DAX_SECTOR(entry);
-	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	/*
+	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
+	 * in the middle of a PMD, the 'index' we are given will be aligned to
+	 * the start index of the PMD, as will the sector we pull from
+	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
+	 * worry about partial PMD writebacks.
+	 */
+	dax.sector = dax_radix_sector(entry);
+	dax.size = PAGE_SIZE << dax_radix_order(entry);
 	spin_unlock_irq(&mapping->tree_lock);
 
 	/*
@@ -726,12 +853,11 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
-	void *entry;
 
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
@@ -741,15 +867,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 	start_index = wbc->range_start >> PAGE_SHIFT;
 	end_index = wbc->range_end >> PAGE_SHIFT;
-	pmd_index = DAX_PMD_INDEX(start_index);
-
-	rcu_read_lock();
-	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
-	rcu_read_unlock();
-
-	/* see if the start of our range is covered by a PMD entry */
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
-		start_index = pmd_index;
 
 	tag_pages_for_writeback(mapping, start_index, end_index);
 
@@ -794,7 +911,7 @@ static int dax_insert_mapping(struct address_space *mapping,
 		return PTR_ERR(dax.addr);
 	dax_unmap_atomic(bdev, &dax);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
 	*entryp = ret;
@@ -841,7 +958,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 	if (IS_ERR(entry)) {
 		error = PTR_ERR(entry);
 		goto out;
@@ -1162,7 +1279,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 	if (IS_ERR(entry)) {
 		error = PTR_ERR(entry);
 		goto out;
@@ -1264,4 +1381,191 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	return VM_FAULT_NOPAGE | major;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+#ifdef CONFIG_FS_DAX_PMD
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
+		struct vm_fault *vmf, unsigned long address,
+		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct block_device *bdev = iomap->bdev;
+	struct blk_dax_ctl dax = {
+		.sector = dax_iomap_sector(iomap, pos),
+		.size = PMD_SIZE,
+	};
+	long length = dax_map_atomic(bdev, &dax);
+	void *ret;
+
+	if (length < 0) /* dax_map_atomic() failed */
+		return VM_FAULT_FALLBACK;
+	if (length < PMD_SIZE)
+		goto unmap_fallback;
+	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
+		goto unmap_fallback;
+	if (!pfn_t_devmap(dax.pfn))
+		goto unmap_fallback;
+
+	dax_unmap_atomic(bdev, &dax);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+			RADIX_DAX_PMD);
+	if (IS_ERR(ret))
+		return VM_FAULT_FALLBACK;
+	*entryp = ret;
+
+	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+
+ unmap_fallback:
+	dax_unmap_atomic(bdev, &dax);
+	return VM_FAULT_FALLBACK;
+}
+
+static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
+		struct vm_fault *vmf, unsigned long address,
+		struct iomap *iomap, void **entryp)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long pmd_addr = address & PMD_MASK;
+	struct page *zero_page;
+	spinlock_t *ptl;
+	pmd_t pmd_entry;
+	void *ret;
+
+	zero_page = mm_get_huge_zero_page(vma->vm_mm);
+
+	if (unlikely(!zero_page))
+		return VM_FAULT_FALLBACK;
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
+			RADIX_DAX_PMD | RADIX_DAX_HZP);
+	if (IS_ERR(ret))
+		return VM_FAULT_FALLBACK;
+	*entryp = ret;
+
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	if (!pmd_none(*pmd)) {
+		spin_unlock(ptl);
+		return VM_FAULT_FALLBACK;
+	}
+
+	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = pmd_mkhuge(pmd_entry);
+	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	spin_unlock(ptl);
+	return VM_FAULT_NOPAGE;
+}
+
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long pmd_addr = address & PMD_MASK;
+	bool write = flags & FAULT_FLAG_WRITE;
+	unsigned int iomap_flags = write ? IOMAP_WRITE : 0;
+	struct inode *inode = mapping->host;
+	int result = VM_FAULT_FALLBACK;
+	struct iomap iomap = { 0 };
+	pgoff_t max_pgoff, pgoff;
+	struct vm_fault vmf;
+	void *entry;
+	loff_t pos;
+	int error;
+
+	/* Fall back to PTEs if we're going to COW */
+	if (write && !(vma->vm_flags & VM_SHARED))
+		goto fallback;
+
+	/* If the PMD would extend outside the VMA */
+	if (pmd_addr < vma->vm_start)
+		goto fallback;
+	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+		goto fallback;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is
+	 * supposed to hold locks serializing us with truncate / punch hole so
+	 * this is a reliable test.
+	 */
+	pgoff = linear_page_index(vma, pmd_addr);
+	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+	if (pgoff > max_pgoff)
+		return VM_FAULT_SIGBUS;
+
+	/* If the PMD would extend beyond the file size */
+	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+		goto fallback;
+
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto fallback;
+
+	/*
+	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
+	 * setting up a mapping, so really we're using iomap_begin() as a way
+	 * to look up our filesystem block.
+	 */
+	pos = (loff_t)pgoff << PAGE_SHIFT;
+	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (iomap.offset + iomap.length < pos + PMD_SIZE)
+		goto finish_iomap;
+
+	vmf.pgoff = pgoff;
+	vmf.flags = flags;
+	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
+				&iomap, pos, write, &entry);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (WARN_ON_ONCE(write))
+			goto finish_iomap;
+		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
+				&entry);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+ finish_iomap:
+	if (ops->iomap_end) {
+		if (result == VM_FAULT_FALLBACK) {
+			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
+					&iomap);
+		} else {
+			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
+					iomap_flags, &iomap);
+			if (error)
+				result = VM_FAULT_FALLBACK;
+		}
+	}
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
+ fallback:
+	if (result == VM_FAULT_FALLBACK) {
+		split_huge_pmd(vma, pmd, address);
+		count_vm_event(THP_FAULT_FALLBACK);
+	}
+	return result;
+}
+EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#endif /* CONFIG_FS_DAX_PMD */
 #endif /* CONFIG_FS_IOMAP */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e9ea78c1cf98..8d1a5c47945f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -9,20 +9,32 @@
 struct iomap_ops;
 
 /*
- * We use lowest available bit in exceptional entry for locking, other two
- * bits to determine entry type. In total 3 special bits.
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a huge zero
+ * page (HZP) or an empty entry that is just used for locking.  In total four
+ * special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
+ * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
  */
-#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_SHIFT	(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
-		RADIX_TREE_EXCEPTIONAL_ENTRY))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
+static inline unsigned long dax_radix_sector(void *entry)
+{
+	return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+			((unsigned long)sector << RADIX_DAX_SHIFT) |
+			RADIX_DAX_ENTRY_LOCK);
+}
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
@@ -67,6 +79,27 @@ static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	return VM_FAULT_FALLBACK;
 }
 
+#ifdef CONFIG_FS_DAX_PMD
+static inline unsigned int dax_radix_order(void *entry)
+{
+	if ((unsigned long)entry & RADIX_DAX_PMD)
+		return PMD_SHIFT - PAGE_SHIFT;
+	return 0;
+}
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+#else
+static inline unsigned int dax_radix_order(void *entry)
+{
+	return 0;
+}
+static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags,
+		struct iomap_ops *ops)
+{
+	return VM_FAULT_FALLBACK;
+}
+#endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 1ffb7dcd1b5d..00ab94a882de 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -137,8 +137,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		} else {
 			/* DAX can replace empty locked entry with a hole */
 			WARN_ON_ONCE(p !=
-				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-					 RADIX_DAX_ENTRY_LOCK));
+				dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
 			/* DAX accounts exceptional entries as normal pages */
 			if (node)
 				workingset_node_pages_dec(node);

From 862f1b9d67187cf441e713cc77efae682d8ebc45 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:35:02 +1100
Subject: [PATCH 15/86] xfs: use struct iomap based DAX PMD fault path

Switch xfs_filemap_pmd_fault() from using dax_pmd_fault() to the new and
improved dax_iomap_pmd_fault().  Also, now that it has no more users,
remove xfs_get_blocks_dax_fault().

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 26 +++++---------------------
 fs/xfs/xfs_aops.h |  3 ---
 fs/xfs/xfs_file.c |  2 +-
 3 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3e57a56cf829..561cf1456c6c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1298,8 +1298,7 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	bool			direct,
-	bool			dax_fault)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1420,13 +1419,8 @@ __xfs_get_blocks(
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
-		if (create) {
-			if (dax_fault)
-				ASSERT(!ISUNWRITTEN(&imap));
-			else
-				xfs_map_direct(inode, bh_result, &imap, offset,
-						is_cow);
-		}
+		if (create)
+			xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
 	}
 
 	/*
@@ -1466,7 +1460,7 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
 int
@@ -1476,17 +1470,7 @@ xfs_get_blocks_direct(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
-}
-
-int
-xfs_get_blocks_dax_fault(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b3c6634f9518..34dc00dfb91d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -59,9 +59,6 @@ int	xfs_get_blocks(struct inode *inode, sector_t offset,
 		       struct buffer_head *map_bh, int create);
 int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 			      struct buffer_head *map_bh, int create);
-int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
-			         struct buffer_head *map_bh, int create);
-
 int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
 		ssize_t size, void *private);
 int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e7f35d548cfc..ca2ab738fae2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1711,7 +1711,7 @@ xfs_filemap_pmd_fault(
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (flags & FAULT_FLAG_WRITE)

From 190b5caad750942c0791889b809ce04b1eede43d Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 8 Nov 2016 11:35:16 +1100
Subject: [PATCH 16/86] dax: remove "depends on BROKEN" from FS_DAX_PMD

Now that DAX PMD faults are once again working and are now participating in
DAX's radix tree locking scheme, allow their config option to be enabled.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..8e9e5f4104f4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -55,7 +55,6 @@ config FS_DAX_PMD
 	depends on FS_DAX
 	depends on ZONE_DEVICE
 	depends on TRANSPARENT_HUGEPAGE
-	depends on BROKEN
 
 endif # BLOCK
 

From 9484ab1bf4464faae695321dd4fa66365beda74e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 10 Nov 2016 10:26:50 +1100
Subject: [PATCH 17/86] dax: Introduce IOMAP_FAULT flag

Introduce a flag telling iomap operations whether they are handling a
fault or other IO. That may influence behavior wrt inode size and
similar things.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c              | 4 ++--
 fs/iomap.c            | 5 +++--
 include/linux/iomap.h | 1 +
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 281e91a63367..28af41b9da3a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1266,7 +1266,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
 	sector_t sector;
 	struct iomap iomap = { 0 };
-	unsigned flags = 0;
+	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
 	int locked_status = 0;
 	void *entry;
@@ -1467,7 +1467,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = address & PMD_MASK;
 	bool write = flags & FAULT_FLAG_WRITE;
-	unsigned int iomap_flags = write ? IOMAP_WRITE : 0;
+	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
diff --git a/fs/iomap.c b/fs/iomap.c
index 013d1d36fbbf..51a02573405e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -468,8 +468,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	offset = page_offset(page);
 	while (length > 0) {
-		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
-				ops, page, iomap_page_mkwrite_actor);
+		ret = iomap_apply(inode, offset, length,
+				IOMAP_WRITE | IOMAP_FAULT, ops, page,
+				iomap_page_mkwrite_actor);
 		if (unlikely(ret <= 0))
 			goto out_unlock;
 		offset += ret;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e63e288dee83..b9e7b8ec8c1d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -44,6 +44,7 @@ struct iomap {
  */
 #define IOMAP_WRITE		(1 << 0)
 #define IOMAP_ZERO		(1 << 1)
+#define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 
 struct iomap_ops {
 	/*

From 1c7dcf69eea3224474fe884a03b3e19b82d1101e Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Sun, 13 Nov 2016 22:20:44 +0100
Subject: [PATCH 18/86] fscrypt: Add in-place encryption mode

ext4 and f2fs require a bounce page when encrypting pages. However, not
all filesystems will need that (eg. UBIFS). This is handled via a
flag on fscrypt_operations where a fs implementation can select in-place
encryption over using a bounce page (which is the default).

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 25 +++++++++++++++----------
 include/linux/fscrypto.h |  6 ++++++
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 98f87fe8f186..f38dc8aac2fe 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -217,8 +217,9 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
  * @plaintext_page: The page to encrypt. Must be locked.
  * @gfp_flags:      The gfp flag for memory allocation
  *
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
+ * Encrypts plaintext_page using the ctx encryption context. If
+ * the filesystem supports it, encryption is performed in-place, otherwise a
+ * new ciphertext_page is allocated and returned.
  *
  * Called on the page write path.  The caller must call
  * fscrypt_restore_control_page() on the returned ciphertext page to
@@ -231,7 +232,7 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
 				struct page *plaintext_page, gfp_t gfp_flags)
 {
 	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = NULL;
+	struct page *ciphertext_page = plaintext_page;
 	int err;
 
 	BUG_ON(!PageLocked(plaintext_page));
@@ -240,10 +241,12 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
 	if (IS_ERR(ctx))
 		return (struct page *)ctx;
 
-	/* The encryption operation will require a bounce page. */
-	ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
-	if (IS_ERR(ciphertext_page))
-		goto errout;
+	if (!(inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION)) {
+		/* The encryption operation will require a bounce page. */
+		ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+		if (IS_ERR(ciphertext_page))
+			goto errout;
+	}
 
 	ctx->w.control_page = plaintext_page;
 	err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
@@ -253,9 +256,11 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
 		ciphertext_page = ERR_PTR(err);
 		goto errout;
 	}
-	SetPagePrivate(ciphertext_page);
-	set_page_private(ciphertext_page, (unsigned long)ctx);
-	lock_page(ciphertext_page);
+	if (!(inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION)) {
+		SetPagePrivate(ciphertext_page);
+		set_page_private(ciphertext_page, (unsigned long)ctx);
+		lock_page(ciphertext_page);
+	}
 	return ciphertext_page;
 
 errout:
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index ff8b11b26f31..5a65b0e3773f 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -153,10 +153,16 @@ struct fscrypt_name {
 #define fname_name(p)		((p)->disk_name.name)
 #define fname_len(p)		((p)->disk_name.len)
 
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_INPLACE_ENCRYPTION (1U << 1)
+
 /*
  * crypto opertions for filesystems
  */
 struct fscrypt_operations {
+	unsigned int flags;
 	int (*get_context)(struct inode *, void *, size_t);
 	int (*key_prefix)(struct inode *, u8 **);
 	int (*prepare_context)(struct inode *);

From b50f7b268bad8088dfe7579a65cd910d8cc5c40f Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Sun, 13 Nov 2016 22:20:45 +0100
Subject: [PATCH 19/86] fscrypt: Allow fscrypt_decrypt_page() to function with
 non-writeback pages

Some filesystem might pass pages which do not have page->mapping->host
set to the encrypted inode. We want the caller to explicitly pass the
corresponding inode.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 11 ++++++-----
 fs/ext4/inode.c          |  5 +++--
 include/linux/fscrypto.h |  5 +++--
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index f38dc8aac2fe..222a70520565 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -271,7 +271,8 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
 
 /**
  * f2crypt_decrypt_page() - Decrypts a page in-place
- * @page: The page to decrypt. Must be locked.
+ * @inode: The encrypted inode to decrypt.
+ * @page:  The page to decrypt. Must be locked.
  *
  * Decrypts page in-place using the ctx encryption context.
  *
@@ -279,12 +280,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  *
  * Return: Zero on success, non-zero otherwise.
  */
-int fscrypt_decrypt_page(struct page *page)
+int fscrypt_decrypt_page(struct inode *inode, struct page *page)
 {
 	BUG_ON(!PageLocked(page));
 
-	return do_page_crypto(page->mapping->host,
-			FS_DECRYPT, page->index, page, page, GFP_NOFS);
+	return do_page_crypto(inode, FS_DECRYPT, page->index, page, page,
+			GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
 
@@ -419,7 +420,7 @@ static void completion_pages(struct work_struct *work)
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_page(page);
+		int ret = fscrypt_decrypt_page(page->mapping->host, page);
 
 		if (ret) {
 			WARN_ON_ONCE(1);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9c064727ed62..4b7b842ec024 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1166,7 +1166,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 	if (unlikely(err))
 		page_zero_new_buffers(page, from, to);
 	else if (decrypt)
-		err = fscrypt_decrypt_page(page);
+		err = fscrypt_decrypt_page(page->mapping->host, page);
 	return err;
 }
 #endif
@@ -3743,7 +3743,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
 			BUG_ON(blocksize != PAGE_SIZE);
-			WARN_ON_ONCE(fscrypt_decrypt_page(page));
+			WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
+						page));
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 5a65b0e3773f..8be8e9657c63 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -249,7 +249,7 @@ int fscrypt_initialize(void);
 extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
 extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t);
-extern int fscrypt_decrypt_page(struct page *);
+extern int fscrypt_decrypt_page(struct inode *, struct page *);
 extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
 extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
@@ -298,7 +298,8 @@ static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline int fscrypt_notsupp_decrypt_page(struct page *p)
+static inline int fscrypt_notsupp_decrypt_page(struct inode *i,
+						struct page *p)
 {
 	return -EOPNOTSUPP;
 }

From 7821d4dd4589ce5af54f3e46d04a29439ba3c2e5 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Sun, 13 Nov 2016 22:20:46 +0100
Subject: [PATCH 20/86] fscrypt: Enable partial page encryption

Not all filesystems work on full pages, thus we should allow them to
hand partial pages to fscrypt for en/decryption.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 42 +++++++++++++++++++++++++---------------
 fs/ext4/inode.c          |  6 ++++--
 fs/ext4/page-io.c        |  2 +-
 fs/f2fs/data.c           |  2 ++
 include/linux/fscrypto.h | 16 ++++++++++-----
 5 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 222a70520565..e170aa05011d 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -149,6 +149,7 @@ typedef enum {
 static int do_page_crypto(struct inode *inode,
 			fscrypt_direction_t rw, pgoff_t index,
 			struct page *src_page, struct page *dest_page,
+			unsigned int src_len, unsigned int src_offset,
 			gfp_t gfp_flags)
 {
 	struct {
@@ -179,10 +180,10 @@ static int do_page_crypto(struct inode *inode,
 	memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
 
 	sg_init_table(&dst, 1);
-	sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
+	sg_set_page(&dst, dest_page, src_len, src_offset);
 	sg_init_table(&src, 1);
-	sg_set_page(&src, src_page, PAGE_SIZE, 0);
-	skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak);
+	sg_set_page(&src, src_page, src_len, src_offset);
+	skcipher_request_set_crypt(req, &src, &dst, src_len, &xts_tweak);
 	if (rw == FS_DECRYPT)
 		res = crypto_skcipher_decrypt(req);
 	else
@@ -213,9 +214,11 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
 
 /**
  * fscypt_encrypt_page() - Encrypts a page
- * @inode:          The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- * @gfp_flags:      The gfp flag for memory allocation
+ * @inode:            The inode for which the encryption should take place
+ * @plaintext_page:   The page to encrypt. Must be locked.
+ * @plaintext_len:    Length of plaintext within page
+ * @plaintext_offset: Offset of plaintext within page
+ * @gfp_flags:        The gfp flag for memory allocation
  *
  * Encrypts plaintext_page using the ctx encryption context. If
  * the filesystem supports it, encryption is performed in-place, otherwise a
@@ -229,13 +232,17 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
  * error value or NULL.
  */
 struct page *fscrypt_encrypt_page(struct inode *inode,
-				struct page *plaintext_page, gfp_t gfp_flags)
+				struct page *plaintext_page,
+				unsigned int plaintext_len,
+				unsigned int plaintext_offset,
+				gfp_t gfp_flags)
+
 {
 	struct fscrypt_ctx *ctx;
 	struct page *ciphertext_page = plaintext_page;
 	int err;
 
-	BUG_ON(!PageLocked(plaintext_page));
+	BUG_ON(plaintext_len % FS_CRYPTO_BLOCK_SIZE != 0);
 
 	ctx = fscrypt_get_ctx(inode, gfp_flags);
 	if (IS_ERR(ctx))
@@ -251,6 +258,7 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
 	ctx->w.control_page = plaintext_page;
 	err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
 					plaintext_page, ciphertext_page,
+					plaintext_len, plaintext_offset,
 					gfp_flags);
 	if (err) {
 		ciphertext_page = ERR_PTR(err);
@@ -270,9 +278,11 @@ errout:
 EXPORT_SYMBOL(fscrypt_encrypt_page);
 
 /**
- * f2crypt_decrypt_page() - Decrypts a page in-place
- * @inode: The encrypted inode to decrypt.
+ * fscrypt_decrypt_page() - Decrypts a page in-place
+ * @inode: Encrypted inode to decrypt.
  * @page:  The page to decrypt. Must be locked.
+ * @len:   Number of bytes in @page to be decrypted.
+ * @offs:  Start of data in @page.
  *
  * Decrypts page in-place using the ctx encryption context.
  *
@@ -280,11 +290,10 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  *
  * Return: Zero on success, non-zero otherwise.
  */
-int fscrypt_decrypt_page(struct inode *inode, struct page *page)
+int fscrypt_decrypt_page(struct inode *inode, struct page *page,
+			unsigned int len, unsigned int offs)
 {
-	BUG_ON(!PageLocked(page));
-
-	return do_page_crypto(inode, FS_DECRYPT, page->index, page, page,
+	return do_page_crypto(inode, FS_DECRYPT, page->index, page, page, len, offs,
 			GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
@@ -312,7 +321,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
 	while (len--) {
 		err = do_page_crypto(inode, FS_ENCRYPT, lblk,
 					ZERO_PAGE(0), ciphertext_page,
-					GFP_NOFS);
+					PAGE_SIZE, 0, GFP_NOFS);
 		if (err)
 			goto errout;
 
@@ -420,7 +429,8 @@ static void completion_pages(struct work_struct *work)
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_page(page->mapping->host, page);
+		int ret = fscrypt_decrypt_page(page->mapping->host, page,
+				PAGE_SIZE, 0);
 
 		if (ret) {
 			WARN_ON_ONCE(1);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b7b842ec024..1d498c5e2990 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1166,7 +1166,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 	if (unlikely(err))
 		page_zero_new_buffers(page, from, to);
 	else if (decrypt)
-		err = fscrypt_decrypt_page(page->mapping->host, page);
+		err = fscrypt_decrypt_page(page->mapping->host, page,
+				PAGE_SIZE, 0);
 	return err;
 }
 #endif
@@ -3743,8 +3744,9 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
 			BUG_ON(blocksize != PAGE_SIZE);
+			BUG_ON(!PageLocked(page));
 			WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
-						page));
+						page, PAGE_SIZE, 0));
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0094923e5ebf..3d1d3d0f4303 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -470,7 +470,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		gfp_t gfp_flags = GFP_NOFS;
 
 	retry_encrypt:
-		data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
+		data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, gfp_flags);
 		if (IS_ERR(data_page)) {
 			ret = PTR_ERR(data_page);
 			if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9ae194fd2fdb..fac207254e8d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1194,7 +1194,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
 		f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
 							fio->old_blkaddr);
 retry_encrypt:
+		BUG_ON(!PageLocked(fio->page));
 		fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+								PAGE_SIZE, 0,
 								gfp_flags);
 		if (IS_ERR(fio->encrypted_page)) {
 			err = PTR_ERR(fio->encrypted_page);
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 8be8e9657c63..e05931272cea 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -248,8 +248,11 @@ int fscrypt_initialize(void);
 
 extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t);
-extern int fscrypt_decrypt_page(struct inode *, struct page *);
+extern struct page *fscrypt_encrypt_page(struct inode *, struct page *,
+						unsigned int, unsigned int,
+						gfp_t);
+extern int fscrypt_decrypt_page(struct inode *, struct page *, unsigned int,
+				unsigned int);
 extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
 extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
@@ -293,13 +296,16 @@ static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c)
 }
 
 static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
-						struct page *p, gfp_t f)
+						struct page *p,
+						unsigned int len,
+						unsigned int offs,
+						gfp_t f)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline int fscrypt_notsupp_decrypt_page(struct inode *i,
-						struct page *p)
+static inline int fscrypt_notsupp_decrypt_page(struct inode *i, struct page *p,
+						unsigned int len, unsigned int offs)
 {
 	return -EOPNOTSUPP;
 }

From 0b93e1b94b86f826d18a2aaf219a53e271274d49 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Sun, 13 Nov 2016 22:20:47 +0100
Subject: [PATCH 21/86] fscrypt: Constify struct inode pointer

Some filesystems, such as UBIFS, maintain a const pointer for struct
inode.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 10 +++++-----
 fs/crypto/fname.c        |  4 ++--
 include/linux/fscrypto.h | 22 +++++++++++-----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index e170aa05011d..f5c5e84ea9db 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx);
  * Return: An allocated and initialized encryption context on success; error
  * value or NULL otherwise.
  */
-struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
+struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
 {
 	struct fscrypt_ctx *ctx = NULL;
 	struct fscrypt_info *ci = inode->i_crypt_info;
@@ -146,7 +146,7 @@ typedef enum {
 	FS_ENCRYPT,
 } fscrypt_direction_t;
 
-static int do_page_crypto(struct inode *inode,
+static int do_page_crypto(const struct inode *inode,
 			fscrypt_direction_t rw, pgoff_t index,
 			struct page *src_page, struct page *dest_page,
 			unsigned int src_len, unsigned int src_offset,
@@ -231,7 +231,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
  * Return: An allocated page with the encrypted content on success. Else, an
  * error value or NULL.
  */
-struct page *fscrypt_encrypt_page(struct inode *inode,
+struct page *fscrypt_encrypt_page(const struct inode *inode,
 				struct page *plaintext_page,
 				unsigned int plaintext_len,
 				unsigned int plaintext_offset,
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  *
  * Return: Zero on success, non-zero otherwise.
  */
-int fscrypt_decrypt_page(struct inode *inode, struct page *page,
+int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 			unsigned int len, unsigned int offs)
 {
 	return do_page_crypto(inode, FS_DECRYPT, page->index, page, page, len, offs,
@@ -298,7 +298,7 @@ int fscrypt_decrypt_page(struct inode *inode, struct page *page,
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
 
-int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 				sector_t pblk, unsigned int len)
 {
 	struct fscrypt_ctx *ctx;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 9a28133ac3b8..faeaf0be9400 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -220,7 +220,7 @@ static int digest_decode(const char *src, int len, char *dst)
 	return cp - dst;
 }
 
-u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
+u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen)
 {
 	int padding = 32;
 	struct fscrypt_info *ci = inode->i_crypt_info;
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
  * Allocates an output buffer that is sufficient for the crypto operation
  * specified by the context and the direction.
  */
-int fscrypt_fname_alloc_buffer(struct inode *inode,
+int fscrypt_fname_alloc_buffer(const struct inode *inode,
 				u32 ilen, struct fscrypt_str *crypto_str)
 {
 	unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index e05931272cea..e9be944a324c 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -212,7 +212,7 @@ static inline struct page *fscrypt_control_page(struct page *page)
 #endif
 }
 
-static inline int fscrypt_has_encryption_key(struct inode *inode)
+static inline int fscrypt_has_encryption_key(const struct inode *inode)
 {
 #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
 	return (inode->i_crypt_info != NULL);
@@ -246,17 +246,17 @@ static inline void fscrypt_set_d_op(struct dentry *dentry)
 extern struct kmem_cache *fscrypt_info_cachep;
 int fscrypt_initialize(void);
 
-extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t);
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
-extern struct page *fscrypt_encrypt_page(struct inode *, struct page *,
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
 						unsigned int, unsigned int,
 						gfp_t);
-extern int fscrypt_decrypt_page(struct inode *, struct page *, unsigned int,
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
 				unsigned int);
 extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
 extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
-extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t,
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
 						unsigned int);
 /* policy.c */
 extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *);
@@ -273,8 +273,8 @@ extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
 extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
 				int lookup, struct fscrypt_name *);
 extern void fscrypt_free_filename(struct fscrypt_name *);
-extern u32 fscrypt_fname_encrypted_size(struct inode *, u32);
-extern int fscrypt_fname_alloc_buffer(struct inode *, u32,
+extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
 				struct fscrypt_str *);
 extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
 extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
@@ -284,7 +284,7 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
 #endif
 
 /* crypto.c */
-static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i,
+static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(const struct inode *i,
 							gfp_t f)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -295,7 +295,7 @@ static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c)
 	return;
 }
 
-static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
+static inline struct page *fscrypt_notsupp_encrypt_page(const struct inode *i,
 						struct page *p,
 						unsigned int len,
 						unsigned int offs,
@@ -304,7 +304,7 @@ static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline int fscrypt_notsupp_decrypt_page(struct inode *i, struct page *p,
+static inline int fscrypt_notsupp_decrypt_page(const struct inode *i, struct page *p,
 						unsigned int len, unsigned int offs)
 {
 	return -EOPNOTSUPP;
@@ -326,7 +326,7 @@ static inline void fscrypt_notsupp_restore_control_page(struct page *p)
 	return;
 }
 
-static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p,
+static inline int fscrypt_notsupp_zeroout_range(const struct inode *i, pgoff_t p,
 					sector_t s, unsigned int f)
 {
 	return -EOPNOTSUPP;

From 9c4bb8a3a9b4de21753053d667310c2b7cb39916 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Sun, 13 Nov 2016 22:20:48 +0100
Subject: [PATCH 22/86] fscrypt: Let fs select encryption index/tweak

Avoid re-use of page index as tweak for AES-XTS when multiple parts of
same page are encrypted. This will happen on multiple (partial) calls of
fscrypt_encrypt_page on same page.
page->index is only valid for writeback pages.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 11 +++++++----
 fs/ext4/inode.c          |  4 ++--
 fs/ext4/page-io.c        |  3 ++-
 fs/f2fs/data.c           |  5 +++--
 include/linux/fscrypto.h |  9 +++++----
 5 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index f5c5e84ea9db..b6029785714c 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -218,6 +218,8 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
  * @plaintext_page:   The page to encrypt. Must be locked.
  * @plaintext_len:    Length of plaintext within page
  * @plaintext_offset: Offset of plaintext within page
+ * @index:            Index for encryption. This is mainly the page index, but
+ *                    but might be different for multiple calls on same page.
  * @gfp_flags:        The gfp flag for memory allocation
  *
  * Encrypts plaintext_page using the ctx encryption context. If
@@ -235,7 +237,7 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 				struct page *plaintext_page,
 				unsigned int plaintext_len,
 				unsigned int plaintext_offset,
-				gfp_t gfp_flags)
+				pgoff_t index, gfp_t gfp_flags)
 
 {
 	struct fscrypt_ctx *ctx;
@@ -256,7 +258,7 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 	}
 
 	ctx->w.control_page = plaintext_page;
-	err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
+	err = do_page_crypto(inode, FS_ENCRYPT, index,
 					plaintext_page, ciphertext_page,
 					plaintext_len, plaintext_offset,
 					gfp_flags);
@@ -283,6 +285,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  * @page:  The page to decrypt. Must be locked.
  * @len:   Number of bytes in @page to be decrypted.
  * @offs:  Start of data in @page.
+ * @index: Index for encryption.
  *
  * Decrypts page in-place using the ctx encryption context.
  *
@@ -291,7 +294,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  * Return: Zero on success, non-zero otherwise.
  */
 int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
-			unsigned int len, unsigned int offs)
+			unsigned int len, unsigned int offs, pgoff_t index)
 {
 	return do_page_crypto(inode, FS_DECRYPT, page->index, page, page, len, offs,
 			GFP_NOFS);
@@ -430,7 +433,7 @@ static void completion_pages(struct work_struct *work)
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
 		int ret = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0);
+				PAGE_SIZE, 0, page->index);
 
 		if (ret) {
 			WARN_ON_ONCE(1);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1d498c5e2990..1485ac273bfb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1167,7 +1167,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 		page_zero_new_buffers(page, from, to);
 	else if (decrypt)
 		err = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0);
+				PAGE_SIZE, 0, page->index);
 	return err;
 }
 #endif
@@ -3746,7 +3746,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 			BUG_ON(blocksize != PAGE_SIZE);
 			BUG_ON(!PageLocked(page));
 			WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
-						page, PAGE_SIZE, 0));
+						page, PAGE_SIZE, 0, page->index));
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3d1d3d0f4303..902a3e3059b3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		gfp_t gfp_flags = GFP_NOFS;
 
 	retry_encrypt:
-		data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, gfp_flags);
+		data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
+						page->index, gfp_flags);
 		if (IS_ERR(data_page)) {
 			ret = PTR_ERR(data_page);
 			if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index fac207254e8d..435590c4b341 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1196,8 +1196,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
 retry_encrypt:
 		BUG_ON(!PageLocked(fio->page));
 		fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
-								PAGE_SIZE, 0,
-								gfp_flags);
+							PAGE_SIZE, 0,
+							fio->page->index,
+							gfp_flags);
 		if (IS_ERR(fio->encrypted_page)) {
 			err = PTR_ERR(fio->encrypted_page);
 			if (err == -ENOMEM) {
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index e9be944a324c..98c71e973a96 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -250,9 +250,9 @@ extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
 extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
 						unsigned int, unsigned int,
-						gfp_t);
+						pgoff_t, gfp_t);
 extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
-				unsigned int);
+				unsigned int, pgoff_t);
 extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
 extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
@@ -299,13 +299,14 @@ static inline struct page *fscrypt_notsupp_encrypt_page(const struct inode *i,
 						struct page *p,
 						unsigned int len,
 						unsigned int offs,
-						gfp_t f)
+						pgoff_t index, gfp_t f)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline int fscrypt_notsupp_decrypt_page(const struct inode *i, struct page *p,
-						unsigned int len, unsigned int offs)
+						unsigned int len, unsigned int offs,
+						pgoff_t index)
 {
 	return -EOPNOTSUPP;
 }

From 08ae877f4e9123ee2a70e8e16cc8c3131ba07dba Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Nov 2016 20:35:52 -0500
Subject: [PATCH 23/86] fscrypto: don't use on-stack buffer for filename
 encryption

With the new (in 4.9) option to use a virtually-mapped stack
(CONFIG_VMAP_STACK), stack buffers cannot be used as input/output for
the scatterlist crypto API because they may not be directly mappable to
struct page.  For short filenames, fname_encrypt() was encrypting a
stack buffer holding the padded filename.  Fix it by encrypting the
filename in-place in the output buffer, thereby making the temporary
buffer unnecessary.

This bug could most easily be observed in a CONFIG_DEBUG_SG kernel
because this allowed the BUG in sg_set_buf() to be triggered.

Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/fname.c | 53 +++++++++++++++++++----------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index faeaf0be9400..6b45d9caeeb0 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -39,65 +39,54 @@ static void fname_crypt_complete(struct crypto_async_request *req, int res)
 static int fname_encrypt(struct inode *inode,
 			const struct qstr *iname, struct fscrypt_str *oname)
 {
-	u32 ciphertext_len;
 	struct skcipher_request *req = NULL;
 	DECLARE_FS_COMPLETION_RESULT(ecr);
 	struct fscrypt_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_ctfm;
 	int res = 0;
 	char iv[FS_CRYPTO_BLOCK_SIZE];
-	struct scatterlist src_sg, dst_sg;
+	struct scatterlist sg;
 	int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
-	char *workbuf, buf[32], *alloc_buf = NULL;
-	unsigned lim;
+	unsigned int lim;
+	unsigned int cryptlen;
 
 	lim = inode->i_sb->s_cop->max_namelen(inode);
 	if (iname->len <= 0 || iname->len > lim)
 		return -EIO;
 
-	ciphertext_len = max(iname->len, (u32)FS_CRYPTO_BLOCK_SIZE);
-	ciphertext_len = round_up(ciphertext_len, padding);
-	ciphertext_len = min(ciphertext_len, lim);
+	/*
+	 * Copy the filename to the output buffer for encrypting in-place and
+	 * pad it with the needed number of NUL bytes.
+	 */
+	cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE);
+	cryptlen = round_up(cryptlen, padding);
+	cryptlen = min(cryptlen, lim);
+	memcpy(oname->name, iname->name, iname->len);
+	memset(oname->name + iname->len, 0, cryptlen - iname->len);
 
-	if (ciphertext_len <= sizeof(buf)) {
-		workbuf = buf;
-	} else {
-		alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
-		if (!alloc_buf)
-			return -ENOMEM;
-		workbuf = alloc_buf;
-	}
+	/* Initialize the IV */
+	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
 
-	/* Allocate request */
+	/* Set up the encryption request */
 	req = skcipher_request_alloc(tfm, GFP_NOFS);
 	if (!req) {
 		printk_ratelimited(KERN_ERR
-			"%s: crypto_request_alloc() failed\n", __func__);
-		kfree(alloc_buf);
+			"%s: skcipher_request_alloc() failed\n", __func__);
 		return -ENOMEM;
 	}
 	skcipher_request_set_callback(req,
 			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 			fname_crypt_complete, &ecr);
+	sg_init_one(&sg, oname->name, cryptlen);
+	skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv);
 
-	/* Copy the input */
-	memcpy(workbuf, iname->name, iname->len);
-	if (iname->len < ciphertext_len)
-		memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
-
-	/* Initialize IV */
-	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
-
-	/* Create encryption request */
-	sg_init_one(&src_sg, workbuf, ciphertext_len);
-	sg_init_one(&dst_sg, oname->name, ciphertext_len);
-	skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+	/* Do the encryption */
 	res = crypto_skcipher_encrypt(req);
 	if (res == -EINPROGRESS || res == -EBUSY) {
+		/* Request is being completed asynchronously; wait for it */
 		wait_for_completion(&ecr.completion);
 		res = ecr.res;
 	}
-	kfree(alloc_buf);
 	skcipher_request_free(req);
 	if (res < 0) {
 		printk_ratelimited(KERN_ERR
@@ -105,7 +94,7 @@ static int fname_encrypt(struct inode *inode,
 		return res;
 	}
 
-	oname->len = ciphertext_len;
+	oname->len = cryptlen;
 	return 0;
 }
 

From a6e08912861757b8b335cc49b093a51ec025c5de Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Nov 2016 20:41:09 -0500
Subject: [PATCH 24/86] fscrypto: don't use on-stack buffer for key derivation

With the new (in 4.9) option to use a virtually-mapped stack
(CONFIG_VMAP_STACK), stack buffers cannot be used as input/output for
the scatterlist crypto API because they may not be directly mappable to
struct page.  get_crypt_info() was using a stack buffer to hold the
output from the encryption operation used to derive the per-file key.
Fix it by using a heap buffer.

This bug could most easily be observed in a CONFIG_DEBUG_SG kernel
because this allowed the BUG in sg_set_buf() to be triggered.

Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/keyinfo.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 82f0285f5d08..67fb6d8876d0 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -185,7 +185,7 @@ int get_crypt_info(struct inode *inode)
 	struct crypto_skcipher *ctfm;
 	const char *cipher_str;
 	int keysize;
-	u8 raw_key[FS_MAX_KEY_SIZE];
+	u8 *raw_key = NULL;
 	int res;
 
 	res = fscrypt_initialize();
@@ -238,6 +238,15 @@ retry:
 	if (res)
 		goto out;
 
+	/*
+	 * This cannot be a stack buffer because it is passed to the scatterlist
+	 * crypto API as part of key derivation.
+	 */
+	res = -ENOMEM;
+	raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS);
+	if (!raw_key)
+		goto out;
+
 	if (fscrypt_dummy_context_enabled(inode)) {
 		memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
 		goto got_key;
@@ -276,7 +285,8 @@ got_key:
 	if (res)
 		goto out;
 
-	memzero_explicit(raw_key, sizeof(raw_key));
+	kzfree(raw_key);
+	raw_key = NULL;
 	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
 		put_crypt_info(crypt_info);
 		goto retry;
@@ -287,7 +297,7 @@ out:
 	if (res == -ENOKEY)
 		res = 0;
 	put_crypt_info(crypt_info);
-	memzero_explicit(raw_key, sizeof(raw_key));
+	kzfree(raw_key);
 	return res;
 }
 

From 2c98eb5ea249767bbc11cf4e70e91d5b0458ed13 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 13 Nov 2016 22:02:26 -0500
Subject: [PATCH 25/86] ext4: allow ext4_truncate() to return an error

This allows us to properly propagate errors back up to
ext4_truncate()'s callers.  This also means we no longer have to
silently ignore some errors (e.g., when trying to add the inode to the
orphan inode list).

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/inode.c | 41 ++++++++++++++++++++++++++---------------
 fs/ext4/ioctl.c |  7 +++++--
 fs/ext4/super.c |  6 ++++--
 4 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 282a51b07c57..be2282dcde7d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2491,7 +2491,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate(struct inode *);
+extern int ext4_truncate(struct inode *);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 48fbdfc43c10..585153057c2a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode)
 			     "couldn't mark inode dirty (err %d)", err);
 		goto stop_handle;
 	}
-	if (inode->i_blocks)
-		ext4_truncate(inode);
+	if (inode->i_blocks) {
+		err = ext4_truncate(inode);
+		if (err) {
+			ext4_error(inode->i_sb,
+				   "couldn't truncate inode %lu (err %d)",
+				   inode->i_ino, err);
+			goto stop_handle;
+		}
+	}
 
 	/*
 	 * ext4_ext_truncate() doesn't reserve any slop when it
@@ -4097,10 +4104,11 @@ int ext4_inode_attach_jinode(struct inode *inode)
  * that's fine - as long as they are linked from the inode, the post-crash
  * ext4_truncate() run will find them and release them.
  */
-void ext4_truncate(struct inode *inode)
+int ext4_truncate(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int credits;
+	int err = 0;
 	handle_t *handle;
 	struct address_space *mapping = inode->i_mapping;
 
@@ -4114,7 +4122,7 @@ void ext4_truncate(struct inode *inode)
 	trace_ext4_truncate_enter(inode);
 
 	if (!ext4_can_truncate(inode))
-		return;
+		return 0;
 
 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 
@@ -4126,13 +4134,13 @@ void ext4_truncate(struct inode *inode)
 
 		ext4_inline_data_truncate(inode, &has_inline);
 		if (has_inline)
-			return;
+			return 0;
 	}
 
 	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
 	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
 		if (ext4_inode_attach_jinode(inode) < 0)
-			return;
+			return 0;
 	}
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4141,10 +4149,8 @@ void ext4_truncate(struct inode *inode)
 		credits = ext4_blocks_for_truncate(inode);
 
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-	if (IS_ERR(handle)) {
-		ext4_std_error(inode->i_sb, PTR_ERR(handle));
-		return;
-	}
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
 	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
 		ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4158,7 +4164,8 @@ void ext4_truncate(struct inode *inode)
 	 * Implication: the file must always be in a sane, consistent
 	 * truncatable state while each transaction commits.
 	 */
-	if (ext4_orphan_add(handle, inode))
+	err = ext4_orphan_add(handle, inode);
+	if (err)
 		goto out_stop;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -4191,6 +4198,7 @@ out_stop:
 	ext4_journal_stop(handle);
 
 	trace_ext4_truncate_exit(inode);
+	return err;
 }
 
 /*
@@ -5205,12 +5213,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		 * in data=journal mode to make pages freeable.
 		 */
 		truncate_pagecache(inode, inode->i_size);
-		if (shrink)
-			ext4_truncate(inode);
+		if (shrink) {
+			rc = ext4_truncate(inode);
+			if (rc)
+				error = rc;
+		}
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
-	if (!rc) {
+	if (!error) {
 		setattr_copy(inode, attr);
 		mark_inode_dirty(inode);
 	}
@@ -5222,7 +5233,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (orphan && inode->i_nlink)
 		ext4_orphan_del(NULL, inode);
 
-	if (!rc && (ia_valid & ATTR_MODE))
+	if (!error && (ia_valid & ATTR_MODE))
 		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae8ebbc97..99862a3726fc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -248,8 +248,11 @@ static int ext4_ioctl_setflags(struct inode *inode,
 			err = -EOPNOTSUPP;
 			goto flags_out;
 		}
-	} else if (oldflags & EXT4_EOFBLOCKS_FL)
-		ext4_truncate(inode);
+	} else if (oldflags & EXT4_EOFBLOCKS_FL) {
+		err = ext4_truncate(inode);
+		if (err)
+			goto flags_out;
+	}
 
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle)) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 20da99da0a34..e4f61c39328a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2330,7 +2330,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 				struct ext4_super_block *es)
 {
 	unsigned int s_flags = sb->s_flags;
-	int nr_orphans = 0, nr_truncates = 0;
+	int ret, nr_orphans = 0, nr_truncates = 0;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
@@ -2412,7 +2412,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 				  inode->i_ino, inode->i_size);
 			inode_lock(inode);
 			truncate_inode_pages(inode->i_mapping, inode->i_size);
-			ext4_truncate(inode);
+			ret = ext4_truncate(inode);
+			if (ret)
+				ext4_std_error(inode->i_sb, ret);
 			inode_unlock(inode);
 			nr_truncates++;
 		} else {

From d0abb36db44faaf8f8aa148ca206fe2404042dec Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 13 Nov 2016 22:02:28 -0500
Subject: [PATCH 26/86] ext4: allow ext4_ext_truncate() to return an error

Return errors to the caller instead of declaring the file system
corrupted.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/extents.c | 15 +++++++--------
 fs/ext4/inode.c   |  4 +++-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index be2282dcde7d..54211c7876f8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3128,7 +3128,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_truncate(handle_t *, struct inode *);
 extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 				 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c930a0110fb4..d3b119499c53 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4631,7 +4631,7 @@ out2:
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(handle_t *handle, struct inode *inode)
+int ext4_ext_truncate(handle_t *handle, struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t last_block;
@@ -4645,7 +4645,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 
 	/* we have to know where to truncate from in crash case */
 	EXT4_I(inode)->i_disksize = inode->i_size;
-	ext4_mark_inode_dirty(handle, inode);
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err)
+		return err;
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
@@ -4657,12 +4659,9 @@ retry:
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 		goto retry;
 	}
-	if (err) {
-		ext4_std_error(inode->i_sb, err);
-		return;
-	}
-	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-	ext4_std_error(inode->i_sb, err);
+	if (err)
+		return err;
+	return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 }
 
 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 585153057c2a..79f46f4b43b8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4173,11 +4173,13 @@ int ext4_truncate(struct inode *inode)
 	ext4_discard_preallocations(inode);
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		ext4_ext_truncate(handle, inode);
+		err = ext4_ext_truncate(handle, inode);
 	else
 		ext4_ind_truncate(handle, inode);
 
 	up_write(&ei->i_data_sem);
+	if (err)
+		goto out_stop;
 
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);

From 1566a48aaa10c6bb29b9a69dd8279f9a4fc41e35 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 13 Nov 2016 22:02:29 -0500
Subject: [PATCH 27/86] ext4: don't lock buffer in ext4_commit_super if holding
 spinlock

If there is an error reported in mballoc via ext4_grp_locked_error(),
the code is holding a spinlock, so ext4_commit_super() must not try to
lock the buffer head, or else it will trigger a BUG:

  BUG: sleeping function called from invalid context at ./include/linux/buffer_head.h:358
  in_atomic(): 1, irqs_disabled(): 0, pid: 993, name: mount
  CPU: 0 PID: 993 Comm: mount Not tainted 4.9.0-rc1-clouder1 #62
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
   ffff880006423548 ffffffff81318c89 ffffffff819ecdd0 0000000000000166
   ffff880006423558 ffffffff810810b0 ffff880006423580 ffffffff81081153
   ffff880006e5a1a0 ffff88000690e400 0000000000000000 ffff8800064235c0
  Call Trace:
    [<ffffffff81318c89>] dump_stack+0x67/0x9e
    [<ffffffff810810b0>] ___might_sleep+0xf0/0x140
    [<ffffffff81081153>] __might_sleep+0x53/0xb0
    [<ffffffff8126c1dc>] ext4_commit_super+0x19c/0x290
    [<ffffffff8126e61a>] __ext4_grp_locked_error+0x14a/0x230
    [<ffffffff81081153>] ? __might_sleep+0x53/0xb0
    [<ffffffff812822be>] ext4_mb_generate_buddy+0x1de/0x320

Since ext4_grp_locked_error() calls ext4_commit_super with sync == 0
(and it is the only caller which does so), avoid locking and unlocking
the buffer in this case.

This can result in races with ext4_commit_super() if there are other
problems (which is what commit 4743f83990614 was trying to address),
but a Warning is better than BUG.

Fixes: 4743f83990614
Cc: stable@vger.kernel.org # 4.9
Reported-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4f61c39328a..ff6f3ab09c7e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4537,7 +4537,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 				&EXT4_SB(sb)->s_freeinodes_counter));
 	BUFFER_TRACE(sbh, "marking dirty");
 	ext4_superblock_csum_set(sb);
-	lock_buffer(sbh);
+	if (sync)
+		lock_buffer(sbh);
 	if (buffer_write_io_error(sbh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the
@@ -4553,8 +4554,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 		set_buffer_uptodate(sbh);
 	}
 	mark_buffer_dirty(sbh);
-	unlock_buffer(sbh);
 	if (sync) {
+		unlock_buffer(sbh);
 		error = __sync_dirty_buffer(sbh,
 			test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
 		if (error)

From 69e43e8cc971a79dd1ee5d4343d8e63f82725123 Mon Sep 17 00:00:00 2001
From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Date: Mon, 14 Nov 2016 21:04:37 -0500
Subject: [PATCH 28/86] ext4: fix mballoc breakage with 64k block size

'border' variable is set to a value of 2 times the block size of the
underlying filesystem. With 64k block size, the resulting value won't
fit into a 16-bit variable. Hence this commit changes the data type of
'border' to 'unsigned int'.

Fixes: c9de560ded61f
Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: stable@vger.kernel.org
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f418f55c2bbe..a937ac7ef99f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	ext4_grpblk_t min;
 	ext4_grpblk_t max;
 	ext4_grpblk_t chunk;
-	unsigned short border;
+	unsigned int border;
 
 	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 

From 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 Mon Sep 17 00:00:00 2001
From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Date: Mon, 14 Nov 2016 21:26:26 -0500
Subject: [PATCH 29/86] ext4: fix stack memory corruption with 64k block size

The number of 'counters' elements needed in 'struct sg' is
super_block->s_blocksize_bits + 2. Presently we have 16 'counters'
elements in the array. This is insufficient for block sizes >= 32k. In
such cases the memcpy operation performed in ext4_mb_seq_groups_show()
would cause stack memory corruption.

Fixes: c9de560ded61f
Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a937ac7ef99f..7ae43c59bc79 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	struct ext4_group_info *grinfo;
 	struct sg {
 		struct ext4_group_info info;
-		ext4_grpblk_t counters[16];
+		ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
 	} sg;
 
 	group--;

From eeca7ea1baa939c97d58ba821f8c6e683e4388f2 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Mon, 14 Nov 2016 21:40:10 -0500
Subject: [PATCH 30/86] ext4: use current_time() for inode timestamps

CURRENT_TIME_SEC and CURRENT_TIME are not y2038 safe.
current_time() will be transitioned to be y2038 safe
along with vfs.

current_time() returns timestamps according to the
granularities set in the super_block.
The granularity check in ext4_current_time() to call
current_time() or CURRENT_TIME_SEC is not required.
Use current_time() directly to obtain timestamps
unconditionally, and remove ext4_current_time().

Quota files are assumed to be on the same filesystem.
Hence, use current_time() for these files as well.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/ext4/acl.c     |  2 +-
 fs/ext4/ext4.h    |  6 ------
 fs/ext4/extents.c | 10 +++++-----
 fs/ext4/ialloc.c  |  2 +-
 fs/ext4/inline.c  |  4 ++--
 fs/ext4/inode.c   |  6 +++---
 fs/ext4/ioctl.c   |  8 ++++----
 fs/ext4/namei.c   | 24 +++++++++++++-----------
 fs/ext4/super.c   |  2 +-
 fs/ext4/xattr.c   |  2 +-
 10 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dfa519979038..fd389935ecd1 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 			error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
 			if (error)
 				return error;
-			inode->i_ctime = ext4_current_time(inode);
+			inode->i_ctime = current_time(inode);
 			ext4_mark_inode_dirty(handle, inode);
 		}
 		break;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 54211c7876f8..53d6d463ac4d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1532,12 +1532,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
 	return container_of(inode, struct ext4_inode_info, vfs_inode);
 }
 
-static inline struct timespec ext4_current_time(struct inode *inode)
-{
-	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
-		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
-}
-
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
 	return ino == EXT4_ROOT_INO ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d3b119499c53..65dbd2bcf775 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4724,7 +4724,7 @@ retry:
 		map.m_lblk += ret;
 		map.m_len = len = len - ret;
 		epos = (loff_t)map.m_lblk << inode->i_blkbits;
-		inode->i_ctime = ext4_current_time(inode);
+		inode->i_ctime = current_time(inode);
 		if (new_size) {
 			if (epos > new_size)
 				epos = new_size;
@@ -4852,7 +4852,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		}
 		/* Now release the pages and zero block aligned part of pages */
 		truncate_pagecache_range(inode, start, end - 1);
-		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+		inode->i_mtime = inode->i_ctime = current_time(inode);
 
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 					     flags, mode);
@@ -4877,7 +4877,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		goto out_dio;
 	}
 
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	if (new_size) {
 		ext4_update_inode_size(inode, new_size);
 	} else {
@@ -5567,7 +5567,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
 out_stop:
@@ -5677,7 +5677,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	/* Expand file to avoid data loss if there is error while shifting */
 	inode->i_size += len;
 	EXT4_I(inode)->i_disksize += len;
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ret = ext4_mark_inode_dirty(handle, inode);
 	if (ret)
 		goto out_stop;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 170421edfdfe..088afe07ddda 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1039,7 +1039,7 @@ got:
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
-						       ext4_current_time(inode);
+						       current_time(inode);
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_dir_start_lookup = 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index f74d5ee2cdec..cfa87bd49c5a 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1028,7 +1028,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+	dir->i_mtime = dir->i_ctime = current_time(dir);
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
 	ext4_mark_inode_dirty(handle, dir);
@@ -1971,7 +1971,7 @@ out:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 79f46f4b43b8..e821ee28fa33 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4039,7 +4039,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
 	ext4_journal_stop(handle);
@@ -4195,7 +4195,7 @@ out_stop:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 
@@ -5170,7 +5170,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			 * update c/mtime in shrink case below
 			 */
 			if (!shrink) {
-				inode->i_mtime = ext4_current_time(inode);
+				inode->i_mtime = current_time(inode);
 				inode->i_ctime = inode->i_mtime;
 			}
 			down_write(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 99862a3726fc..fc1cd37ba2d9 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
 	swap_inode_data(inode, inode_bl);
 
-	inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+	inode->i_ctime = inode_bl->i_ctime = current_time(inode);
 
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
@@ -275,7 +275,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	}
 
 	ext4_set_inode_flags(inode);
-	inode->i_ctime = ext4_current_time(inode);
+	inode->i_ctime = current_time(inode);
 
 	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
@@ -371,7 +371,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
 	}
 
 	EXT4_I(inode)->i_projid = kprojid;
-	inode->i_ctime = ext4_current_time(inode);
+	inode->i_ctime = current_time(inode);
 out_dirty:
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
@@ -503,7 +503,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 		err = ext4_reserve_inode_write(handle, inode, &iloc);
 		if (err == 0) {
-			inode->i_ctime = ext4_current_time(inode);
+			inode->i_ctime = current_time(inode);
 			inode->i_generation = generation;
 			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 		}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 104f8bfba718..eadba919f26b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+	dir->i_mtime = dir->i_ctime = current_time(dir);
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
 	ext4_mark_inode_dirty(handle, dir);
@@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	 * recovery. */
 	inode->i_size = 0;
 	ext4_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_dec_count(handle, dir);
 	ext4_update_dx_flag(dir);
@@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_unlink;
-	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+	dir->i_ctime = dir->i_mtime = current_time(dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	drop_nlink(inode);
 	if (!inode->i_nlink)
 		ext4_orphan_add(handle, inode);
-	inode->i_ctime = ext4_current_time(inode);
+	inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
 end_unlink:
@@ -3254,7 +3254,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode->i_ctime = ext4_current_time(inode);
+	inode->i_ctime = current_time(inode);
 	ext4_inc_count(handle, inode);
 	ihold(inode);
 
@@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
 		ent->de->file_type = file_type;
 	ent->dir->i_version++;
 	ent->dir->i_ctime = ent->dir->i_mtime =
-		ext4_current_time(ent->dir);
+		current_time(ent->dir);
 	ext4_mark_inode_dirty(handle, ent->dir);
 	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
 	if (!ent->inlined) {
@@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
 	 */
-	old.inode->i_ctime = ext4_current_time(old.inode);
+	old.inode->i_ctime = current_time(old.inode);
 	ext4_mark_inode_dirty(handle, old.inode);
 
 	if (!whiteout) {
@@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	if (new.inode) {
 		ext4_dec_count(handle, new.inode);
-		new.inode->i_ctime = ext4_current_time(new.inode);
+		new.inode->i_ctime = current_time(new.inode);
 	}
-	old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+	old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
 	ext4_update_dx_flag(old.dir);
 	if (old.dir_bh) {
 		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
@@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	};
 	u8 new_file_type;
 	int retval;
+	struct timespec ctime;
 
 	if ((ext4_encrypted_inode(old_dir) ||
 	     ext4_encrypted_inode(new_dir)) &&
@@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
 	 */
-	old.inode->i_ctime = ext4_current_time(old.inode);
-	new.inode->i_ctime = ext4_current_time(new.inode);
+	ctime = current_time(old.inode);
+	old.inode->i_ctime = ctime;
+	new.inode->i_ctime = ctime;
 	ext4_mark_inode_dirty(handle, old.inode);
 	ext4_mark_inode_dirty(handle, new.inode);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ff6f3ab09c7e..35ccbdc2d64e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5354,7 +5354,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
 	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
 	if (IS_ERR(handle))
 		goto out;
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d77be9e9f535..9c1bb5cd89d2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1249,7 +1249,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	}
 	if (!error) {
 		ext4_xattr_update_super_block(handle, inode->i_sb);
-		inode->i_ctime = ext4_current_time(inode);
+		inode->i_ctime = current_time(inode);
 		if (!value)
 			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

From 88e0387769c4bc36c93d25329f6151fb3d6474b2 Mon Sep 17 00:00:00 2001
From: Eric Whitney <enwlinux@gmail.com>
Date: Mon, 14 Nov 2016 21:48:35 -0500
Subject: [PATCH 31/86] ext4: allow inode expansion for nojournal file systems

Runs of xfstest ext4/022 on nojournal file systems result in failures
because the inodes of some of its test files do not expand as expected.
The cause is a conditional in ext4_mark_inode_dirty() that prevents inode
expansion unless the test file system has a journal.  Remove this
unnecessary restriction.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e821ee28fa33..b1b4c85bbae3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5474,18 +5474,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (err)
 		return err;
-	if (ext4_handle_valid(handle) &&
-	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
 	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
 		/*
-		 * We need extra buffer credits since we may write into EA block
+		 * In nojournal mode, we can immediately attempt to expand
+		 * the inode.  When journaled, we first need to obtain extra
+		 * buffer credits since we may write into the EA block
 		 * with this same handle. If journal_extend fails, then it will
 		 * only result in a minor loss of functionality for that inode.
 		 * If this is felt to be critical, then e2fsck should be run to
 		 * force a large enough s_min_extra_isize.
 		 */
-		if ((jbd2_journal_extend(handle,
-			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+		if (!ext4_handle_valid(handle) ||
+		    jbd2_journal_extend(handle,
+			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) {
 			ret = ext4_expand_extra_isize(inode,
 						      sbi->s_want_extra_isize,
 						      iloc, handle);

From d5c8dab6a8a1e328b976140ee7dc8e66957aaf61 Mon Sep 17 00:00:00 2001
From: Eric Whitney <enwlinux@gmail.com>
Date: Mon, 14 Nov 2016 21:56:48 -0500
Subject: [PATCH 32/86] ext4: remove parameter from ext4_xattr_ibody_set()

The parameter "handle" isn't used.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9c1bb5cd89d2..1846e9168f80 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1109,7 +1109,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 	return 0;
 }
 
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+static int ext4_xattr_ibody_set(struct inode *inode,
 				struct ext4_xattr_info *i,
 				struct ext4_xattr_ibody_find *is)
 {
@@ -1216,7 +1216,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	}
 	if (!value) {
 		if (!is.s.not_found)
-			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+			error = ext4_xattr_ibody_set(inode, &i, &is);
 		else if (!bs.s.not_found)
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
 	} else {
@@ -1227,7 +1227,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
 			goto cleanup;
 
-		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+		error = ext4_xattr_ibody_set(inode, &i, &is);
 		if (!error && !bs.s.not_found) {
 			i.value = NULL;
 			error = ext4_xattr_block_set(handle, inode, &i, &bs);
@@ -1242,8 +1242,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 				goto cleanup;
 			if (!is.s.not_found) {
 				i.value = NULL;
-				error = ext4_xattr_ibody_set(handle, inode, &i,
-							     &is);
+				error = ext4_xattr_ibody_set(inode, &i, &is);
 			}
 		}
 	}
@@ -1384,7 +1383,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
 		goto out;
 
 	/* Remove the chosen entry from the inode */
-	error = ext4_xattr_ibody_set(handle, inode, &i, is);
+	error = ext4_xattr_ibody_set(inode, &i, is);
 	if (error)
 		goto out;
 

From 9e47a4c9fc58032ee135bf76516809c7624b1551 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Nov 2016 13:00:24 -0500
Subject: [PATCH 33/86] ext4: sanity check the block and cluster size at mount
 time

If the block size or cluster size is insane, reject the mount.  This
is important for security reasons (although we shouldn't be just
depending on this check).

Ref: http://www.securityfocus.com/archive/1/539661
Ref: https://bugzilla.redhat.com/show_bug.cgi?id=1332506
Reported-by: Borislav Petkov <bp@alien8.de>
Reported-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/super.c | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 53d6d463ac4d..bdf1e5ee8642 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -235,6 +235,7 @@ struct ext4_io_submit {
 #define	EXT4_MAX_BLOCK_SIZE		65536
 #define EXT4_MIN_BLOCK_LOG_SIZE		10
 #define EXT4_MAX_BLOCK_LOG_SIZE		16
+#define EXT4_MAX_CLUSTER_LOG_SIZE	30
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE(s)		((s)->s_blocksize)
 #else
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 35ccbdc2d64e..0f9ae4ce33d6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3567,7 +3567,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
 		ext4_msg(sb, KERN_ERR,
-		       "Unsupported filesystem blocksize %d", blocksize);
+		       "Unsupported filesystem blocksize %d (%d log_block_size)",
+			 blocksize, le32_to_cpu(es->s_log_block_size));
+		goto failed_mount;
+	}
+	if (le32_to_cpu(es->s_log_block_size) >
+	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Invalid log block size: %u",
+			 le32_to_cpu(es->s_log_block_size));
 		goto failed_mount;
 	}
 
@@ -3699,6 +3707,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 "block size (%d)", clustersize, blocksize);
 			goto failed_mount;
 		}
+		if (le32_to_cpu(es->s_log_cluster_size) >
+		    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Invalid log cluster size: %u",
+				 le32_to_cpu(es->s_log_cluster_size));
+			goto failed_mount;
+		}
 		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
 			le32_to_cpu(es->s_log_block_size);
 		sbi->s_clusters_per_group =

From 5aee0f8a3f42c94c5012f1673420aee96315925a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Nov 2016 13:24:26 -0500
Subject: [PATCH 34/86] ext4: fix in-superblock mount options processing

Fix a large number of problems with how we handle mount options in the
superblock.  For one, if the string in the superblock is long enough
that it is not null terminated, we could run off the end of the string
and try to interpret superblocks fields as characters.  It's unlikely
this will cause a security problem, but it could result in an invalid
parse.  Also, parse_options is destructive to the string, so in some
cases if there is a comma-separated string, it would be modified in
the superblock.  (Fortunately it only happens on file systems with a
1k block size.)

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0f9ae4ce33d6..404e6f3c1bed 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3303,7 +3303,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
-	struct ext4_sb_info *sbi;
+	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	ext4_fsblk_t block;
 	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
@@ -3322,16 +3322,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	ext4_group_t first_not_zeroed;
 
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		goto out_free_orig;
+	if ((data && !orig_data) || !sbi)
+		goto out_free_base;
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
-	if (!sbi->s_blockgroup_lock) {
-		kfree(sbi);
-		goto out_free_orig;
-	}
+	if (!sbi->s_blockgroup_lock)
+		goto out_free_base;
+
 	sb->s_fs_info = sbi;
 	sbi->s_sb = sb;
 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
@@ -3477,11 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
-	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
-			   &journal_devnum, &journal_ioprio, 0)) {
-		ext4_msg(sb, KERN_WARNING,
-			 "failed to parse options in superblock: %s",
-			 sbi->s_es->s_mount_opts);
+	if (sbi->s_es->s_mount_opts[0]) {
+		char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+					      sizeof(sbi->s_es->s_mount_opts),
+					      GFP_KERNEL);
+		if (!s_mount_opts)
+			goto failed_mount;
+		if (!parse_options(s_mount_opts, sb, &journal_devnum,
+				   &journal_ioprio, 0)) {
+			ext4_msg(sb, KERN_WARNING,
+				 "failed to parse options in superblock: %s",
+				 s_mount_opts);
+		}
+		kfree(s_mount_opts);
 	}
 	sbi->s_def_mount_opt = sbi->s_mount_opt;
 	if (!parse_options((char *) data, sb, &journal_devnum,
@@ -4162,7 +4168,9 @@ no_journal:
 
 	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
 		ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-			 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+			 "Opts: %.*s%s%s", descr,
+			 (int) sizeof(sbi->s_es->s_mount_opts),
+			 sbi->s_es->s_mount_opts,
 			 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
 
 	if (es->s_error_count)
@@ -4241,8 +4249,8 @@ failed_mount:
 out_fail:
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
+out_free_base:
 	kfree(sbi);
-out_free_orig:
 	kfree(orig_data);
 	return err ? err : ret;
 }

From cd6bb35bf7f6d7d922509bf50265383a0ceabe96 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Nov 2016 13:28:30 -0500
Subject: [PATCH 35/86] ext4: use more strict checks for inodes_per_block on
 mount

Centralize the checks for inodes_per_block and be more strict to make
sure the inodes_per_block_group can't end up being zero.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 404e6f3c1bed..689c02df1af4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3668,12 +3668,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
-		goto cantfind_ext4;
 
 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0)
 		goto cantfind_ext4;
+	if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+	    sbi->s_inodes_per_group > blocksize * 8) {
+		ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
+			 sbi->s_blocks_per_group);
+		goto failed_mount;
+	}
 	sbi->s_itb_per_group = sbi->s_inodes_per_group /
 					sbi->s_inodes_per_block;
 	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
@@ -3756,13 +3760,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sbi->s_cluster_ratio = clustersize / blocksize;
 
-	if (sbi->s_inodes_per_group > blocksize * 8) {
-		ext4_msg(sb, KERN_ERR,
-		       "#inodes per group too big: %lu",
-		       sbi->s_inodes_per_group);
-		goto failed_mount;
-	}
-
 	/* Do we have standard group size of clustersize * 8 blocks ? */
 	if (sbi->s_blocks_per_group == clustersize << 3)
 		set_opt2(sb, STD_GROUP_SIZE);

From c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Nov 2016 13:37:47 -0500
Subject: [PATCH 36/86] ext4: add sanity checking to count_overhead()

The commit "ext4: sanity check the block and cluster size at mount
time" should prevent any problems, but in case the superblock is
modified while the file system is mounted, add an extra safety check
to make sure we won't overrun the allocated buffer.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 689c02df1af4..2d8a49d74f56 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3195,10 +3195,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
 			ext4_set_bit(s++, buf);
 			count++;
 		}
-		for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
-			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
-			count++;
+		j = ext4_bg_num_gdb(sb, grp);
+		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
+			ext4_error(sb, "Invalid number of block group "
+				   "descriptor blocks: %d", j);
+			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
 		}
+		count += j;
+		for (; j > 0; j--)
+			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
 	}
 	if (!count)
 		return 0;

From 213bcd9ccbf04b709e4764ad89aaaa66a47785f0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 17:29:51 -0500
Subject: [PATCH 37/86] ext4: factor out checks from ext4_file_write_iter()

Factor out checks of 'from' and whether we are overwriting out of
ext4_file_write_iter() so that the function is easier to follow.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/file.c | 97 ++++++++++++++++++++++++++------------------------
 1 file changed, 50 insertions(+), 47 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a822d30e73f..9facb4dc5c70 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -88,6 +88,51 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
 	return 0;
 }
 
+/* Is IO overwriting allocated and initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+{
+	struct ext4_map_blocks map;
+	unsigned int blkbits = inode->i_blkbits;
+	int err, blklen;
+
+	if (pos + len > i_size_read(inode))
+		return false;
+
+	map.m_lblk = pos >> blkbits;
+	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
+	blklen = map.m_len;
+
+	err = ext4_map_blocks(NULL, inode, &map, 0);
+	/*
+	 * 'err==len' means that all of the blocks have been preallocated,
+	 * regardless of whether they have been initialized or not. To exclude
+	 * unwritten extents, we need to check m_flags.
+	 */
+	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+}
+
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		return ret;
+	/*
+	 * If we have encountered a bitmap-format file, the size limit
+	 * is smaller than s_maxbytes, which is for extent-mapped files.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
+			return -EFBIG;
+		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
+	}
+	return iov_iter_count(from);
+}
+
 static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -98,7 +143,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t ret;
 
 	inode_lock(inode);
-	ret = generic_write_checks(iocb, from);
+	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
 
@@ -114,53 +159,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ext4_unwritten_wait(inode);
 	}
 
-	/*
-	 * If we have encountered a bitmap-format file, the size limit
-	 * is smaller than s_maxbytes, which is for extent-mapped files.
-	 */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
-		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-
-		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
-			ret = -EFBIG;
-			goto out;
-		}
-		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
-	}
-
 	iocb->private = &overwrite;
-	if (o_direct) {
-		size_t length = iov_iter_count(from);
-		loff_t pos = iocb->ki_pos;
-
-		/* check whether we do a DIO overwrite or not */
-		if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
-		    pos + length <= i_size_read(inode)) {
-			struct ext4_map_blocks map;
-			unsigned int blkbits = inode->i_blkbits;
-			int err, len;
-
-			map.m_lblk = pos >> blkbits;
-			map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits);
-			len = map.m_len;
-
-			err = ext4_map_blocks(NULL, inode, &map, 0);
-			/*
-			 * 'err==len' means that all of blocks has
-			 * been preallocated no matter they are
-			 * initialized or not.  For excluding
-			 * unwritten extents, we need to check
-			 * m_flags.  There are two conditions that
-			 * indicate for initialized extents.  1) If we
-			 * hit extent cache, EXT4_MAP_MAPPED flag is
-			 * returned; 2) If we do a real lookup,
-			 * non-flags are returned.  So we should check
-			 * these two conditions.
-			 */
-			if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
-				overwrite = 1;
-		}
-	}
+	/* Check whether we do a DIO overwrite or not */
+	if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
+	    ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
+		overwrite = 1;
 
 	ret = __generic_file_write_iter(iocb, from);
 	inode_unlock(inode);

From a3caa24b703794507bf2e0a68bdc800b90f5e70b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 17:32:59 -0500
Subject: [PATCH 38/86] ext4: only set S_DAX if DAX is really supported

Currently we have S_DAX set inode->i_flags for a regular file whenever
ext4 is mounted with dax mount option. However in some cases we cannot
really do DAX - e.g. when inode is marked to use data journalling, when
inode data is being encrypted, or when inode is stored inline. Make sure
S_DAX flag is appropriately set/cleared in these cases.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inline.c | 10 ++++++++++
 fs/ext4/inode.c  |  9 ++++++++-
 fs/ext4/super.c  |  6 ++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cfa87bd49c5a..9b67f75bdcf7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle,
 	EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
 	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
 	ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+	/*
+	 * Propagate changes to inode->i_flags as well - e.g. S_DAX may
+	 * get cleared
+	 */
+	ext4_set_inode_flags(inode);
 	get_bh(is.iloc.bh);
 	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 
@@ -442,6 +447,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
 		}
 	}
 	ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+	/*
+	 * Propagate changes to inode->i_flags as well - e.g. S_DAX may
+	 * get set.
+	 */
+	ext4_set_inode_flags(inode);
 
 	get_bh(is.iloc.bh);
 	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b1b4c85bbae3..733fda1c3646 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4368,7 +4368,9 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
+	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) &&
+	    !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
+	    !ext4_encrypted_inode(inode))
 		new_fl |= S_DAX;
 	inode_set_flags(inode, new_fl,
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
@@ -5641,6 +5643,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	}
 	ext4_set_aops(inode);
+	/*
+	 * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated.
+	 * E.g. S_DAX may get cleared / set.
+	 */
+	ext4_set_inode_flags(inode);
 
 	jbd2_journal_unlock_updates(journal);
 	percpu_up_write(&sbi->s_journal_flag_rwsem);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2d8a49d74f56..cbfaee175fa5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1126,6 +1126,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
 			ext4_clear_inode_state(inode,
 					EXT4_STATE_MAY_INLINE_DATA);
+			/*
+			 * Update inode->i_flags - e.g. S_DAX may get disabled
+			 */
+			ext4_set_inode_flags(inode);
 		}
 		return res;
 	}
@@ -1140,6 +1144,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 			len, 0);
 	if (!res) {
 		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+		/* Update inode->i_flags - e.g. S_DAX may get disabled */
+		ext4_set_inode_flags(inode);
 		res = ext4_mark_inode_dirty(handle, inode);
 		if (res)
 			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");

From 364443cbcfe70f927b6a0dc0d410b4d4318bc1ca Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 17:36:06 -0500
Subject: [PATCH 39/86] ext4: convert DAX reads to iomap infrastructure

Implement basic iomap_begin function that handles reading and use it for
DAX reads.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  2 ++
 fs/ext4/file.c  | 38 +++++++++++++++++++++++++++++++++-
 fs/ext4/inode.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bdf1e5ee8642..da82de650350 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3266,6 +3266,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
 	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
 }
 
+extern struct iomap_ops ext4_iomap_ops;
+
 #endif	/* __KERNEL__ */
 
 #define EFSBADCRC	EBADMSG		/* Bad CRC detected */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9facb4dc5c70..1f25c644cb12 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,6 +31,42 @@
 #include "xattr.h"
 #include "acl.h"
 
+#ifdef CONFIG_FS_DAX
+static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	inode_lock_shared(inode);
+	/*
+	 * Recheck under inode lock - at this point we are sure it cannot
+	 * change anymore
+	 */
+	if (!IS_DAX(inode)) {
+		inode_unlock_shared(inode);
+		/* Fallback to buffered IO in case we cannot support DAX */
+		return generic_file_read_iter(iocb, to);
+	}
+	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+#endif
+
+static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(file_inode(iocb->ki_filp)))
+		return ext4_dax_read_iter(iocb, to);
+#endif
+	return generic_file_read_iter(iocb, to);
+}
+
 /*
  * Called when an inode is released. Note that this is different
  * from ext4_file_open: open gets called at every open, but release
@@ -690,7 +726,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
 
 const struct file_operations ext4_file_operations = {
 	.llseek		= ext4_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= ext4_file_read_iter,
 	.write_iter	= ext4_file_write_iter,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 733fda1c3646..5e0526fed165 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/bitops.h>
+#include <linux/iomap.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -3318,6 +3319,59 @@ int ext4_dax_get_block(struct inode *inode, sector_t iblock,
 	clear_buffer_new(bh_result);
 	return 0;
 }
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+			    unsigned flags, struct iomap *iomap)
+{
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long last_block = (offset + length - 1) >> blkbits;
+	struct ext4_map_blocks map;
+	int ret;
+
+	if (flags & IOMAP_WRITE)
+		return -EIO;
+
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+
+	map.m_lblk = first_block;
+	map.m_len = last_block - first_block + 1;
+
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = (u64)map.m_len << blkbits;
+	} else {
+		if (map.m_flags & EXT4_MAP_MAPPED) {
+			iomap->type = IOMAP_MAPPED;
+		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+			iomap->type = IOMAP_UNWRITTEN;
+		} else {
+			WARN_ON_ONCE(1);
+			return -EIO;
+		}
+		iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
+		iomap->length = (u64)map.m_len << blkbits;
+	}
+
+	if (map.m_flags & EXT4_MAP_NEW)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
+}
+
+struct iomap_ops ext4_iomap_ops = {
+	.iomap_begin		= ext4_iomap_begin,
+};
+
 #else
 /* Just define empty function, it will never get called. */
 int ext4_dax_get_block(struct inode *inode, sector_t iblock,

From 47e6935136b1f9fbda59cd929409f8e7cee4a1e4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 18:08:05 -0500
Subject: [PATCH 40/86] ext4: use iomap for zeroing blocks in DAX mode

Use iomap infrastructure for zeroing blocks when in DAX mode.
ext4_iomap_begin() handles read requests just fine and that's all that
is needed for iomap_zero_range().

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5e0526fed165..6d186ca2c34b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3859,8 +3859,10 @@ static int ext4_block_zero_page_range(handle_t *handle,
 	if (length > max || length < 0)
 		length = max;
 
-	if (IS_DAX(inode))
-		return dax_zero_page_range(inode, from, length, ext4_get_block);
+	if (IS_DAX(inode)) {
+		return iomap_zero_range(inode, from, length, NULL,
+					&ext4_iomap_ops);
+	}
 	return __ext4_block_zero_page_range(handle, mapping, from, length);
 }
 

From 776722e85d3b0936253ecc3d14db4fba37f191ba Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 18:09:11 -0500
Subject: [PATCH 41/86] ext4: DAX iomap write support

Implement DAX writes using the new iomap infrastructure instead of
overloading the direct IO path.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/file.c  |  40 +++++++++++++++
 fs/ext4/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 160 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1f25c644cb12..1953fe34f9fe 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return iov_iter_count(from);
 }
 
+#ifdef CONFIG_FS_DAX
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+	bool overwrite = false;
+
+	inode_lock(inode);
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		goto out;
+	ret = file_update_time(iocb->ki_filp);
+	if (ret)
+		goto out;
+
+	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
+		overwrite = true;
+		downgrade_write(&inode->i_rwsem);
+	}
+	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+out:
+	if (!overwrite)
+		inode_unlock(inode);
+	else
+		inode_unlock_shared(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+#endif
+
 static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	int overwrite = 0;
 	ssize_t ret;
 
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(inode))
+		return ext4_dax_write_iter(iocb, from);
+#endif
+
 	inode_lock(inode);
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6d186ca2c34b..3941cee21e4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3329,18 +3329,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	struct ext4_map_blocks map;
 	int ret;
 
-	if (flags & IOMAP_WRITE)
-		return -EIO;
-
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
 
 	map.m_lblk = first_block;
 	map.m_len = last_block - first_block + 1;
 
-	ret = ext4_map_blocks(NULL, inode, &map, 0);
-	if (ret < 0)
-		return ret;
+	if (!(flags & IOMAP_WRITE)) {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	} else {
+		int dio_credits;
+		handle_t *handle;
+		int retries = 0;
+
+		/* Trim mapping request to maximum we can map at once for DIO */
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+retry:
+		/*
+		 * Either we allocate blocks and then we don't get unwritten
+		 * extent so we have reserved enough credits, or the blocks
+		 * are already allocated and unwritten and in that case
+		 * extent conversion fits in the credits as well.
+		 */
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    dio_credits);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		ret = ext4_map_blocks(handle, inode, &map,
+				      EXT4_GET_BLOCKS_PRE_IO |
+				      EXT4_GET_BLOCKS_CREATE_ZERO);
+		if (ret < 0) {
+			ext4_journal_stop(handle);
+			if (ret == -ENOSPC &&
+			    ext4_should_retry_alloc(inode->i_sb, &retries))
+				goto retry;
+			return ret;
+		}
+		/* For DAX writes we need to zero out unwritten extents */
+		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+			/*
+			 * We are protected by i_mmap_sem or i_rwsem so we know
+			 * block cannot go away from under us even though we
+			 * dropped i_data_sem. Convert extent to written and
+			 * write zeros there.
+			 */
+			ret = ext4_map_blocks(handle, inode, &map,
+					      EXT4_GET_BLOCKS_CONVERT |
+					      EXT4_GET_BLOCKS_CREATE_ZERO);
+			if (ret < 0) {
+				ext4_journal_stop(handle);
+				return ret;
+			}
+		}
+
+		/*
+		 * If we added blocks beyond i_size we need to make sure they
+		 * will get truncated if we crash before updating i_size in
+		 * ext4_iomap_end().
+		 */
+		if (first_block + map.m_len >
+		    (inode->i_size + (1 << blkbits) - 1) >> blkbits) {
+			int err;
+
+			err = ext4_orphan_add(handle, inode);
+			if (err < 0) {
+				ext4_journal_stop(handle);
+				return err;
+			}
+		}
+		ext4_journal_stop(handle);
+	}
 
 	iomap->flags = 0;
 	iomap->bdev = inode->i_sb->s_bdev;
@@ -3368,8 +3429,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	return 0;
 }
 
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+			  ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	int ret = 0;
+	handle_t *handle;
+	int blkbits = inode->i_blkbits;
+	bool truncate = false;
+
+	if (!(flags & IOMAP_WRITE))
+		return 0;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto orphan_del;
+	}
+	if (ext4_update_inode_size(inode, offset + written))
+		ext4_mark_inode_dirty(handle, inode);
+	/*
+	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 */
+	if (iomap->offset + iomap->length > 
+	    ALIGN(inode->i_size, 1 << blkbits)) {
+		ext4_lblk_t written_blk, end_blk;
+
+		written_blk = (offset + written) >> blkbits;
+		end_blk = (offset + length) >> blkbits;
+		if (written_blk < end_blk && ext4_can_truncate(inode))
+			truncate = true;
+	}
+	/*
+	 * Remove inode from orphan list if we were extending a inode and
+	 * everything went fine.
+	 */
+	if (!truncate && inode->i_nlink &&
+	    !list_empty(&EXT4_I(inode)->i_orphan))
+		ext4_orphan_del(handle, inode);
+	ext4_journal_stop(handle);
+	if (truncate) {
+		ext4_truncate_failed_write(inode);
+orphan_del:
+		/*
+		 * If truncate failed early the inode might still be on the
+		 * orphan list; we need to make sure the inode is removed from
+		 * the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+	return ret;
+}
+
 struct iomap_ops ext4_iomap_ops = {
 	.iomap_begin		= ext4_iomap_begin,
+	.iomap_end		= ext4_iomap_end,
 };
 
 #else

From 96f8ba3dd632aff684cc7c67d9f4af435be0341c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 18:10:09 -0500
Subject: [PATCH 42/86] ext4: avoid split extents for DAX writes

Currently mapping of blocks for DAX writes happen with
EXT4_GET_BLOCKS_PRE_IO flag set. That has a result that each
ext4_map_blocks() call creates a separate written extent, although it
could be merged to the neighboring extents in the extent tree.  The
reason for using this flag is that in case the extent is unwritten, we
need to convert it to written one and zero it out. However this "convert
mapped range to written" operation is already implemented by
ext4_map_blocks() for the case of data writes into unwritten extent. So
just use flags for that mode of operation, simplify the code, and avoid
unnecessary split extents.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3941cee21e4c..29237f25ddbe 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3359,7 +3359,6 @@ retry:
 			return PTR_ERR(handle);
 
 		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_PRE_IO |
 				      EXT4_GET_BLOCKS_CREATE_ZERO);
 		if (ret < 0) {
 			ext4_journal_stop(handle);
@@ -3368,22 +3367,6 @@ retry:
 				goto retry;
 			return ret;
 		}
-		/* For DAX writes we need to zero out unwritten extents */
-		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-			/*
-			 * We are protected by i_mmap_sem or i_rwsem so we know
-			 * block cannot go away from under us even though we
-			 * dropped i_data_sem. Convert extent to written and
-			 * write zeros there.
-			 */
-			ret = ext4_map_blocks(handle, inode, &map,
-					      EXT4_GET_BLOCKS_CONVERT |
-					      EXT4_GET_BLOCKS_CREATE_ZERO);
-			if (ret < 0) {
-				ext4_journal_stop(handle);
-				return ret;
-			}
-		}
 
 		/*
 		 * If we added blocks beyond i_size we need to make sure they

From e2ae766c1b030271b5099b25674e2131d1d1e8c1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 18:51:24 -0500
Subject: [PATCH 43/86] ext4: convert DAX faults to iomap infrastructure

Convert DAX faults to use iomap infrastructure. We would not have to start
transaction in ext4_dax_fault() anymore since ext4_iomap_begin takes
care of that but so far we do that to avoid lock inversion of
transaction start with DAX entry lock which gets acquired in
dax_iomap_fault() before calling ->iomap_begin handler.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/file.c  |  9 +++++----
 fs/ext4/inode.c | 14 +++++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1953fe34f9fe..b5f184493c57 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -275,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
 	else
-		result = dax_fault(vma, vmf, ext4_dax_get_block);
+		result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
 
 	if (write) {
 		if (!IS_ERR(handle))
@@ -309,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
-	else
-		result = dax_pmd_fault(vma, addr, pmd, flags,
-					 ext4_dax_get_block);
+	else {
+		result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
+					     &ext4_iomap_ops);
+	}
 
 	if (write) {
 		if (!IS_ERR(handle))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29237f25ddbe..9de9a5a5d2a4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3369,12 +3369,16 @@ retry:
 		}
 
 		/*
-		 * If we added blocks beyond i_size we need to make sure they
+		 * If we added blocks beyond i_size, we need to make sure they
 		 * will get truncated if we crash before updating i_size in
-		 * ext4_iomap_end().
+		 * ext4_iomap_end(). For faults we don't need to do that (and
+		 * even cannot because for orphan list operations inode_lock is
+		 * required) - if we happen to instantiate block beyond i_size,
+		 * it is because we race with truncate which has already added
+		 * the inode to the orphan list.
 		 */
-		if (first_block + map.m_len >
-		    (inode->i_size + (1 << blkbits) - 1) >> blkbits) {
+		if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
+		    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
 			int err;
 
 			err = ext4_orphan_add(handle, inode);
@@ -3420,7 +3424,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 	int blkbits = inode->i_blkbits;
 	bool truncate = false;
 
-	if (!(flags & IOMAP_WRITE))
+	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
 		return 0;
 
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

From 0bd2d5ec3d7655a849928f04597a0ceea0329176 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 18:53:30 -0500
Subject: [PATCH 44/86] ext4: rip out DAX handling from direct IO path

Reads and writes for DAX inodes should no longer end up in direct IO
code. Rip out the support and add a warning.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  2 -
 fs/ext4/inode.c | 97 ++++++++-----------------------------------------
 2 files changed, 15 insertions(+), 84 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index da82de650350..6673e88011ec 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2452,8 +2452,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 			     struct buffer_head *bh_result, int create);
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
-		       struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create);
 int ext4_dio_get_block(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9de9a5a5d2a4..861f848159e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3280,46 +3280,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 #ifdef CONFIG_FS_DAX
-/*
- * Get block function for DAX IO and mmap faults. It takes care of converting
- * unwritten extents to written ones and initializes new / converted blocks
- * to zeros.
- */
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
-		       struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
-	if (!create)
-		return _ext4_get_block(inode, iblock, bh_result, 0);
-
-	ret = ext4_get_block_trans(inode, iblock, bh_result,
-				   EXT4_GET_BLOCKS_PRE_IO |
-				   EXT4_GET_BLOCKS_CREATE_ZERO);
-	if (ret < 0)
-		return ret;
-
-	if (buffer_unwritten(bh_result)) {
-		/*
-		 * We are protected by i_mmap_sem or i_mutex so we know block
-		 * cannot go away from under us even though we dropped
-		 * i_data_sem. Convert extent to written and write zeros there.
-		 */
-		ret = ext4_get_block_trans(inode, iblock, bh_result,
-					   EXT4_GET_BLOCKS_CONVERT |
-					   EXT4_GET_BLOCKS_CREATE_ZERO);
-		if (ret < 0)
-			return ret;
-	}
-	/*
-	 * At least for now we have to clear BH_New so that DAX code
-	 * doesn't attempt to zero blocks again in a racy way.
-	 */
-	clear_buffer_new(bh_result);
-	return 0;
-}
-
 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@@ -3473,14 +3433,6 @@ struct iomap_ops ext4_iomap_ops = {
 	.iomap_end		= ext4_iomap_end,
 };
 
-#else
-/* Just define empty function, it will never get called. */
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
-		       struct buffer_head *bh_result, int create)
-{
-	BUG();
-	return 0;
-}
 #endif
 
 static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -3602,19 +3554,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 	iocb->private = NULL;
 	if (overwrite)
 		get_block_func = ext4_dio_get_block_overwrite;
-	else if (IS_DAX(inode)) {
-		/*
-		 * We can avoid zeroing for aligned DAX writes beyond EOF. Other
-		 * writes need zeroing either because they can race with page
-		 * faults or because they use partial blocks.
-		 */
-		if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
-		    ext4_aligned_io(inode, offset, count))
-			get_block_func = ext4_dio_get_block;
-		else
-			get_block_func = ext4_dax_get_block;
-		dio_flags = DIO_LOCKING;
-	} else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
 		   round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
 		get_block_func = ext4_dio_get_block;
 		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
@@ -3628,14 +3568,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, iter, get_block_func,
-				ext4_end_io_dio, dio_flags);
-	} else
-		ret = __blockdev_direct_IO(iocb, inode,
-					   inode->i_sb->s_bdev, iter,
-					   get_block_func,
-					   ext4_end_io_dio, NULL, dio_flags);
+	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+				   get_block_func, ext4_end_io_dio, NULL,
+				   dio_flags);
 
 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
@@ -3704,6 +3639,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
+	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
 	/*
@@ -3712,19 +3648,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
 	 * we are protected against page writeback as well.
 	 */
 	inode_lock_shared(inode);
-	if (IS_DAX(inode)) {
-		ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0);
-	} else {
-		size_t count = iov_iter_count(iter);
-
-		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-						   iocb->ki_pos + count);
-		if (ret)
-			goto out_unlock;
-		ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-					   iter, ext4_dio_get_block,
-					   NULL, NULL, 0);
-	}
+	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
+					   iocb->ki_pos + count);
+	if (ret)
+		goto out_unlock;
+	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+				   iter, ext4_dio_get_block, NULL, NULL, 0);
 out_unlock:
 	inode_unlock_shared(inode);
 	return ret;
@@ -3753,6 +3682,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (ext4_has_inline_data(inode))
 		return 0;
 
+	/* DAX uses iomap path now */
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return 0;
+
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 	if (iov_iter_rw(iter) == READ)
 		ret = ext4_direct_IO_read(iocb, iter);

From 00697eed386d57c5267aad5b8343f027b10da0c0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 20:47:07 -0500
Subject: [PATCH 45/86] ext2: use iomap_zero_range() for zeroing truncated page
 in DAX path

Currently the last user of ext2_get_blocks() for DAX inodes was
dax_truncate_page(). Convert that to iomap_zero_range() so that all DAX
IO uses the iomap path.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext2/inode.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 41b8b44a391c..046b642f3585 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -850,6 +850,9 @@ struct iomap_ops ext2_iomap_ops = {
 	.iomap_begin		= ext2_iomap_begin,
 	.iomap_end		= ext2_iomap_end,
 };
+#else
+/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
+struct iomap_ops ext2_iomap_ops;
 #endif /* CONFIG_FS_DAX */
 
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -1293,9 +1296,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
 	inode_dio_wait(inode);
 
-	if (IS_DAX(inode))
-		error = dax_truncate_page(inode, newsize, ext2_get_block);
-	else if (test_opt(inode->i_sb, NOBH))
+	if (IS_DAX(inode)) {
+		error = iomap_zero_range(inode, newsize,
+					 PAGE_ALIGN(newsize) - newsize, NULL,
+					 &ext2_iomap_ops);
+	} else if (test_opt(inode->i_sb, NOBH))
 		error = nobh_truncate_page(inode->i_mapping,
 				newsize, ext2_get_block);
 	else

From dd936e4313fa3f60abd6e67abb3cb66fc9a018d1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 20 Nov 2016 20:48:36 -0500
Subject: [PATCH 46/86] dax: rip out get_block based IO support

No one uses functions using the get_block callback anymore. Rip them
out and update documentation.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 Documentation/filesystems/dax.txt |  22 +--
 fs/dax.c                          | 315 ------------------------------
 include/linux/dax.h               |  12 --
 3 files changed, 11 insertions(+), 338 deletions(-)

diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 23d18b8a49d5..a7e6e14aeb08 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
 Filesystem support consists of
 - adding support to mark inodes as being DAX by setting the S_DAX flag in
   i_flags
-- implementing the direct_IO address space operation, and calling
-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
+  when inode has S_DAX flag set
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
-  include handlers for fault, pmd_fault and page_mkwrite (which should
-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
-  appropriate get_block() callback)
-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
-- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
+  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
+  iomap operations.
+- calling iomap_zero_range() passing appropriate iomap operations instead of
+  block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
   truncates and page faults
 
-The get_block() callback passed to the DAX functions may return
-uninitialised extents.  If it does, it must ensure that simultaneous
-calls to get_block() (for example by a page-fault racing with a read()
-or a write()) work correctly.
+The iomap handlers for allocating blocks must make sure that allocated blocks
+are zeroed out and converted to written extents before being returned to avoid
+exposure of uninitialized data through mmap.
 
 These filesystems may be used for inspiration:
 - ext2: see Documentation/filesystems/ext2.txt
diff --git a/fs/dax.c b/fs/dax.c
index 28af41b9da3a..ad131cd2605d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 	return page;
 }
 
-static bool buffer_written(struct buffer_head *bh)
-{
-	return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-static sector_t to_sector(const struct buffer_head *bh,
-		const struct inode *inode)
-{
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-	return sector;
-}
-
-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-		      loff_t start, loff_t end, get_block_t get_block,
-		      struct buffer_head *bh)
-{
-	loff_t pos = start, max = start, bh_max = start;
-	bool hole = false;
-	struct block_device *bdev = NULL;
-	int rw = iov_iter_rw(iter), rc;
-	long map_len = 0;
-	struct blk_dax_ctl dax = {
-		.addr = ERR_PTR(-EIO),
-	};
-	unsigned blkbits = inode->i_blkbits;
-	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
-								>> blkbits;
-
-	if (rw == READ)
-		end = min(end, i_size_read(inode));
-
-	while (pos < end) {
-		size_t len;
-		if (pos == max) {
-			long page = pos >> PAGE_SHIFT;
-			sector_t block = page << (PAGE_SHIFT - blkbits);
-			unsigned first = pos - (block << blkbits);
-			long size;
-
-			if (pos == bh_max) {
-				bh->b_size = PAGE_ALIGN(end - pos);
-				bh->b_state = 0;
-				rc = get_block(inode, block, bh, rw == WRITE);
-				if (rc)
-					break;
-				bh_max = pos - first + bh->b_size;
-				bdev = bh->b_bdev;
-				/*
-				 * We allow uninitialized buffers for writes
-				 * beyond EOF as those cannot race with faults
-				 */
-				WARN_ON_ONCE(
-					(buffer_new(bh) && block < file_blks) ||
-					(rw == WRITE && buffer_unwritten(bh)));
-			} else {
-				unsigned done = bh->b_size -
-						(bh_max - (pos - first));
-				bh->b_blocknr += done >> blkbits;
-				bh->b_size -= done;
-			}
-
-			hole = rw == READ && !buffer_written(bh);
-			if (hole) {
-				size = bh->b_size - first;
-			} else {
-				dax_unmap_atomic(bdev, &dax);
-				dax.sector = to_sector(bh, inode);
-				dax.size = bh->b_size;
-				map_len = dax_map_atomic(bdev, &dax);
-				if (map_len < 0) {
-					rc = map_len;
-					break;
-				}
-				dax.addr += first;
-				size = map_len - first;
-			}
-			/*
-			 * pos + size is one past the last offset for IO,
-			 * so pos + size can overflow loff_t at extreme offsets.
-			 * Cast to u64 to catch this and get the true minimum.
-			 */
-			max = min_t(u64, pos + size, end);
-		}
-
-		if (iov_iter_rw(iter) == WRITE) {
-			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-		} else if (!hole)
-			len = copy_to_iter((void __force *) dax.addr, max - pos,
-					iter);
-		else
-			len = iov_iter_zero(max - pos, iter);
-
-		if (!len) {
-			rc = -EFAULT;
-			break;
-		}
-
-		pos += len;
-		if (!IS_ERR(dax.addr))
-			dax.addr += len;
-	}
-
-	dax_unmap_atomic(bdev, &dax);
-
-	return (pos == start) ? rc : pos - start;
-}
-
-/**
- * dax_do_io - Perform I/O to a DAX file
- * @iocb: The control block for this I/O
- * @inode: The file which the I/O is directed at
- * @iter: The addresses to do I/O from or to
- * @get_block: The filesystem method used to translate file offsets to blocks
- * @end_io: A filesystem callback for I/O completion
- * @flags: See below
- *
- * This function uses the same locking scheme as do_blockdev_direct_IO:
- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
- * caller for writes.  For reads, we take and release the i_mutex ourselves.
- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		  struct iov_iter *iter, get_block_t get_block,
-		  dio_iodone_t end_io, int flags)
-{
-	struct buffer_head bh;
-	ssize_t retval = -EINVAL;
-	loff_t pos = iocb->ki_pos;
-	loff_t end = pos + iov_iter_count(iter);
-
-	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-
-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-		inode_lock(inode);
-
-	/* Protects against truncate */
-	if (!(flags & DIO_SKIP_DIO_COUNT))
-		inode_dio_begin(inode);
-
-	retval = dax_io(inode, iter, pos, end, get_block, &bh);
-
-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-		inode_unlock(inode);
-
-	if (end_io) {
-		int err;
-
-		err = end_io(iocb, pos, retval, bh.b_private);
-		if (err)
-			retval = err;
-	}
-
-	if (!(flags & DIO_SKIP_DIO_COUNT))
-		inode_dio_end(inode);
-	return retval;
-}
-EXPORT_SYMBOL_GPL(dax_do_io);
-
 /*
  * DAX radix tree locking
  */
@@ -919,105 +757,6 @@ static int dax_insert_mapping(struct address_space *mapping,
 	return vm_insert_mixed(vma, vaddr, dax.pfn);
 }
 
-/**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	void *entry;
-	struct buffer_head bh;
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	unsigned blkbits = inode->i_blkbits;
-	sector_t block;
-	pgoff_t size;
-	int error;
-	int major = 0;
-
-	/*
-	 * Check whether offset isn't beyond end of file now. Caller is supposed
-	 * to hold locks serializing us with truncate / punch hole so this is
-	 * a reliable test.
-	 */
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		return VM_FAULT_SIGBUS;
-
-	memset(&bh, 0, sizeof(bh));
-	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_SIZE;
-
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
-	error = get_block(inode, block, &bh, 0);
-	if (!error && (bh.b_size < PAGE_SIZE))
-		error = -EIO;		/* fs corruption? */
-	if (error)
-		goto unlock_entry;
-
-	if (vmf->cow_page) {
-		struct page *new_page = vmf->cow_page;
-		if (buffer_written(&bh))
-			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
-					bh.b_size, new_page, vaddr);
-		else
-			clear_user_highpage(new_page, vaddr);
-		if (error)
-			goto unlock_entry;
-		if (!radix_tree_exceptional_entry(entry)) {
-			vmf->page = entry;
-			return VM_FAULT_LOCKED;
-		}
-		vmf->entry = entry;
-		return VM_FAULT_DAX_LOCKED;
-	}
-
-	if (!buffer_mapped(&bh)) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_SIZE))
-				error = -EIO;
-			if (error)
-				goto unlock_entry;
-		} else {
-			return dax_load_hole(mapping, entry, vmf);
-		}
-	}
-
-	/* Filesystem should not return unwritten buffers to us! */
-	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
-			bh.b_size, &entry, vma, vmf);
- unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if ((error < 0) && (error != -EBUSY))
-		return VM_FAULT_SIGBUS | major;
-	return VM_FAULT_NOPAGE | major;
-}
-EXPORT_SYMBOL_GPL(dax_fault);
-
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-/**
- * dax_zero_page_range - zero a range within a page of a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @length: The number of bytes to zero
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * This function can be called by a filesystem when it is zeroing part of a
- * page in a DAX file.  This is intended for hole-punch operations.  If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-							get_block_t get_block)
-{
-	struct buffer_head bh;
-	pgoff_t index = from >> PAGE_SHIFT;
-	unsigned offset = from & (PAGE_SIZE-1);
-	int err;
-
-	/* Block boundary? Nothing to do */
-	if (!length)
-		return 0;
-	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
-		return -EINVAL;
-
-	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_SIZE;
-	err = get_block(inode, index, &bh, 0);
-	if (err < 0 || !buffer_written(&bh))
-		return err;
-
-	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
-			offset, length);
-}
-EXPORT_SYMBOL_GPL(dax_zero_page_range);
-
-/**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
-	unsigned length = PAGE_ALIGN(from) - from;
-	return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
-
 #ifdef CONFIG_FS_IOMAP
 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8d1a5c47945f..0afade8bd3d7 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
-		  get_block_t, dio_iodone_t, int flags);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			struct iomap_ops *ops);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-				pmd_t *pmd, unsigned int flags, get_block_t gb)
-{
-	return VM_FAULT_FALLBACK;
-}
-
 #ifdef CONFIG_FS_DAX_PMD
 static inline unsigned int dax_radix_order(void *entry)
 {
@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
 }
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {

From d086630e193b5837aa3432a65cda5751aa11b425 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Mon, 21 Nov 2016 11:51:44 -0500
Subject: [PATCH 47/86] ext4: remove unused function ext4_aligned_io()

The last user of ext4_aligned_io() was the DAX path in
ext4_direct_IO_write().  This usage was removed by Jan Kara's patch
entitled "ext4: Rip out DAX handling from direct IO path".

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6673e88011ec..aff204f040fc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3257,13 +3257,6 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 	}
 }
 
-static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
-{
-	int blksize = 1 << inode->i_blkbits;
-
-	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
-}
-
 extern struct iomap_ops ext4_iomap_ops;
 
 #endif	/* __KERNEL__ */

From 2f8f5e76c7da787153f3b9791a2cb4fb384e0e67 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 21 Nov 2016 11:52:44 -0500
Subject: [PATCH 48/86] ext4: avoid lockdep warning when inheriting encryption
 context

On a lockdep-enabled kernel, xfstests generic/027 fails due to a lockdep
warning when run on ext4 mounted with -o test_dummy_encryption:

    xfs_io/4594 is trying to acquire lock:
     (jbd2_handle
    ){++++.+}, at:
    [<ffffffff813096ef>] jbd2_log_wait_commit+0x5/0x11b

    but task is already holding lock:
     (jbd2_handle
    ){++++.+}, at:
    [<ffffffff813000de>] start_this_handle+0x354/0x3d8

The abbreviated call stack is:

 [<ffffffff813096ef>] ? jbd2_log_wait_commit+0x5/0x11b
 [<ffffffff8130972a>] jbd2_log_wait_commit+0x40/0x11b
 [<ffffffff813096ef>] ? jbd2_log_wait_commit+0x5/0x11b
 [<ffffffff8130987b>] ? __jbd2_journal_force_commit+0x76/0xa6
 [<ffffffff81309896>] __jbd2_journal_force_commit+0x91/0xa6
 [<ffffffff813098b9>] jbd2_journal_force_commit_nested+0xe/0x18
 [<ffffffff812a6049>] ext4_should_retry_alloc+0x72/0x79
 [<ffffffff812f0c1f>] ext4_xattr_set+0xef/0x11f
 [<ffffffff812cc35b>] ext4_set_context+0x3a/0x16b
 [<ffffffff81258123>] fscrypt_inherit_context+0xe3/0x103
 [<ffffffff812ab611>] __ext4_new_inode+0x12dc/0x153a
 [<ffffffff812bd371>] ext4_create+0xb7/0x161

When a file is created in an encrypted directory, ext4_set_context() is
called to set an encryption context on the new file.  This calls
ext4_xattr_set(), which contains a retry loop where the journal is
forced to commit if an ENOSPC error is encountered.

If the task actually were to wait for the journal to commit in this
case, then it would deadlock because a handle remains open from
__ext4_new_inode(), so the running transaction can't be committed yet.
Fortunately, __jbd2_journal_force_commit() avoids the deadlock by not
allowing the running transaction to be committed while the current task
has it open.  However, the above lockdep warning is still triggered.

This was a false positive which was introduced by: 1eaa566d368b: jbd2:
track more dependencies on transaction commit

Fix the problem by passing the handle through the 'fs_data' argument to
ext4_set_context(), then using ext4_xattr_set_handle() instead of
ext4_xattr_set().  And in the case where no journal handle is specified
and ext4_set_context() has to open one, add an ENOSPC retry loop since
in that case it is the outermost transaction.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/ext4/ialloc.c |  3 +--
 fs/ext4/super.c  | 32 ++++++++++++++++++++++----------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 088afe07ddda..e57e8d90ea54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1115,8 +1115,7 @@ got:
 	}
 
 	if (encrypt) {
-		/* give pointer to avoid set_context with journal ops. */
-		err = fscrypt_inherit_context(dir, inode, &encrypt, true);
+		err = fscrypt_inherit_context(dir, inode, handle, true);
 		if (err)
 			goto fail_free_drop;
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cbfaee175fa5..bb3a8edc75db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1114,14 +1114,22 @@ static int ext4_prepare_context(struct inode *inode)
 static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 							void *fs_data)
 {
-	handle_t *handle;
-	int res, res2;
+	handle_t *handle = fs_data;
+	int res, res2, retries = 0;
 
-	/* fs_data is null when internally used. */
-	if (fs_data) {
-		res  = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
-				EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
-				len, 0);
+	/*
+	 * If a journal handle was specified, then the encryption context is
+	 * being set on a new inode via inheritance and is part of a larger
+	 * transaction to create the inode.  Otherwise the encryption context is
+	 * being set on an existing inode in its own transaction.  Only in the
+	 * latter case should the "retry on ENOSPC" logic be used.
+	 */
+
+	if (handle) {
+		res = ext4_xattr_set_handle(handle, inode,
+					    EXT4_XATTR_INDEX_ENCRYPTION,
+					    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+					    ctx, len, 0);
 		if (!res) {
 			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
 			ext4_clear_inode_state(inode,
@@ -1134,14 +1142,15 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 		return res;
 	}
 
+retry:
 	handle = ext4_journal_start(inode, EXT4_HT_MISC,
 			ext4_jbd2_credits_xattr(inode));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
-			EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
-			len, 0);
+	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+				    ctx, len, 0);
 	if (!res) {
 		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
 		/* Update inode->i_flags - e.g. S_DAX may get disabled */
@@ -1151,6 +1160,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
 	}
 	res2 = ext4_journal_stop(handle);
+
+	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 	if (!res)
 		res = res2;
 	return res;

From 4f5a763c9a0d4b179064d51dddde8421fdbb76c7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 22 Nov 2016 23:21:58 -0500
Subject: [PATCH 49/86] ext4: Add select for CONFIG_FS_IOMAP

When ext4 is compiled with DAX support, it now needs the iomap code. Add
appropriate select to Kconfig.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index e38039fd96ff..7b90691e98c4 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,6 +37,7 @@ config EXT4_FS
 	select CRC16
 	select CRYPTO
 	select CRYPTO_CRC32C
+	select FS_IOMAP if FS_DAX
 	help
 	  This is the next generation of the ext3 filesystem.
 

From 9060dd2c5036b12132f9b97e3486ca6422d5bdfc Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 26 Nov 2016 14:24:51 -0500
Subject: [PATCH 50/86] ext4: fix mmp use after free during unmount

In ext4_put_super, we call brelse on the buffer head containing
the ext4 superblock, but then try to use it when we stop the
mmp thread, because when the thread shuts down it does:

write_mmp_block
  ext4_mmp_csum_set
    ext4_has_metadata_csum
      WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb)...)

which reaches into sb->s_fs_info->s_es->s_feature_ro_compat,
which lives in the superblock buffer s_sbh which we just released.

Fix this by moving the brelse down to a point where we are no
longer using it.

Reported-by: Wang Shu <shuwang@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bb3a8edc75db..a526956e49e7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
 	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
-	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb)
 	}
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
+	brelse(sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	/*
 	 * Now that we are completely done shutting down the

From f8011d93a2d125b812e8b90bedb5be2a00149ac4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 29 Nov 2016 11:13:13 -0500
Subject: [PATCH 51/86] ext4: add EXT4_JOURNAL_DATA_FL and EXT4_EXTENTS_FL to
 modifiable mask

Add EXT4_JOURNAL_DATA_FL and EXT4_EXTENTS_FL to EXT4_FL_USER_MODIFIABLE
to recognize that they are modifiable by userspace. So far we got away
without having them there because ext4_ioctl_setflags() treats them in a
special way. But it was really confusing like that.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 2 +-
 fs/ext4/ioctl.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aff204f040fc..6e8b8e3fa4b7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -397,7 +397,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x304BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x204380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x204BC0FF /* User modifiable flags */
 
 #define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \
 					 EXT4_IMMUTABLE_FL | \
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index fc1cd37ba2d9..834a1c10b2c2 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -268,6 +268,9 @@ static int ext4_ioctl_setflags(struct inode *inode,
 	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
 		if (!(mask & EXT4_FL_USER_MODIFIABLE))
 			continue;
+		/* These flags get special treatment later */
+		if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL)
+			continue;
 		if (mask & flags)
 			ext4_set_inode_flag(inode, i);
 		else

From d14e7683ecf06aa4365ee96adf74494114341dbe Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 29 Nov 2016 11:18:39 -0500
Subject: [PATCH 52/86] ext4: be more strict when verifying flags set via
 SETFLAGS ioctls

Currently we just silently ignore flags that we don't understand (or
that cannot be manipulated) through EXT4_IOC_SETFLAGS and
EXT4_IOC_FSSETXATTR ioctls. This makes it problematic for the unused
flags to be used in future (some app may be inadvertedly setting them
and we won't notice until the flag gets used). Also this is inconsistent
with other filesystems like XFS or BTRFS which return EOPNOTSUPP when
they see a flag they cannot set.

ext4 has the additional problem that there are flags which are returned
by EXT4_IOC_GETFLAGS ioctl but which cannot be modified via
EXT4_IOC_SETFLAGS. So we have to be careful to ignore value of these
flags and not fail the ioctl when they are set (as e.g. chattr(1) passes
flags returned from EXT4_IOC_GETFLAGS to EXT4_IOC_SETFLAGS without any
masking and thus we'd break this utility).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/ioctl.c | 28 +++++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6e8b8e3fa4b7..f9958f661789 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -399,6 +399,7 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE		0x304BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x204BC0FF /* User modifiable flags */
 
+/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
 #define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \
 					 EXT4_IMMUTABLE_FL | \
 					 EXT4_APPEND_FL | \
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 834a1c10b2c2..2faeb3cdbbb3 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -415,6 +415,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
 	return xflags;
 }
 
+#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
+				  FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
+				  FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+
 /* Transfer xflags flags to internal */
 static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
 {
@@ -459,12 +463,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (get_user(flags, (int __user *) arg))
 			return -EFAULT;
 
+		if (flags & ~EXT4_FL_USER_VISIBLE)
+			return -EOPNOTSUPP;
+		/*
+		 * chattr(1) grabs flags via GETFLAGS, modifies the result and
+		 * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+		 * more restrictive than just silently masking off visible but
+		 * not settable flags as we always did.
+		 */
+		flags &= EXT4_FL_USER_MODIFIABLE;
+		if (ext4_mask_flags(inode->i_mode, flags) != flags)
+			return -EOPNOTSUPP;
+
 		err = mnt_want_write_file(filp);
 		if (err)
 			return err;
 
-		flags = ext4_mask_flags(inode->i_mode, flags);
-
 		inode_lock(inode);
 		err = ext4_ioctl_setflags(inode, flags);
 		inode_unlock(inode);
@@ -871,13 +885,17 @@ resizefs_out:
 		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
+		if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
+			return -EOPNOTSUPP;
+
+		flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+		if (ext4_mask_flags(inode->i_mode, flags) != flags)
+			return -EOPNOTSUPP;
+
 		err = mnt_want_write_file(filp);
 		if (err)
 			return err;
 
-		flags = ext4_xflags_to_iflags(fa.fsx_xflags);
-		flags = ext4_mask_flags(inode->i_mode, flags);
-
 		inode_lock(inode);
 		flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
 			 (flags & EXT4_FL_XFLAG_VISIBLE);

From 6dcc693bc57f198bd85a7881eb59a915366fae1e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 1 Dec 2016 11:46:40 -0500
Subject: [PATCH 53/86] ext4: warn when page is dirtied without buffers

Warn when a page is dirtied without buffers (as that will likely lead to
a crash in ext4_writepages()) or when it gets newly dirtied without the
page being locked (as there is nothing that prevents buffers to get
stripped just before calling set_page_dirty() under memory pressure).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 861f848159e8..7d95b6174980 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3714,6 +3714,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
+static int ext4_set_page_dirty(struct page *page)
+{
+	WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
+	WARN_ON_ONCE(!page_has_buffers(page));
+	return __set_page_dirty_buffers(page);
+}
+
 static const struct address_space_operations ext4_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
@@ -3721,6 +3728,7 @@ static const struct address_space_operations ext4_aops = {
 	.writepages		= ext4_writepages,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_write_end,
+	.set_page_dirty		= ext4_set_page_dirty,
 	.bmap			= ext4_bmap,
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
@@ -3753,6 +3761,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.writepages		= ext4_writepages,
 	.write_begin		= ext4_da_write_begin,
 	.write_end		= ext4_da_write_end,
+	.set_page_dirty		= ext4_set_page_dirty,
 	.bmap			= ext4_bmap,
 	.invalidatepage		= ext4_da_invalidatepage,
 	.releasepage		= ext4_releasepage,

From 05ac5aa18abd7db341e54df4ae2b4c98ea0e43b7 Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Thu, 1 Dec 2016 11:49:12 -0500
Subject: [PATCH 54/86] ext4: fix inode checksum calculation problem if
 i_extra_size is small

We've fixed the race condition problem in calculating ext4 checksum
value in commit b47820edd163 ("ext4: avoid modifying checksum fields
directly during checksum veficationon"). However, by this change,
when calculating the checksum value of inode whose i_extra_size is
less than 4, we couldn't calculate the checksum value in a proper way.
This problem was found and reported by Nix, Thank you.

Reported-by: Nix <nix@esperi.org.uk>
Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Youngjin Gil <youngjin.gil@samsung.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7d95b6174980..b48ca0392b9c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
 			csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
 					   csum_size);
 			offset += csum_size;
-			csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
-					   EXT4_INODE_SIZE(inode->i_sb) -
-					   offset);
 		}
+		csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+				   EXT4_INODE_SIZE(inode->i_sb) - offset);
 	}
 
 	return csum;

From 35997d1ce8b01f9f002dd080c59e22430ead6db1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Dec 2016 11:54:18 -0500
Subject: [PATCH 55/86] ext4: get rid of ext4_sb_has_crypto()

ext4_sb_has_crypto() just called through to ext4_has_feature_encrypt(),
and all callers except one were already using the latter.  So remove it
and switch its one caller to ext4_has_feature_encrypt().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 5 -----
 fs/ext4/ioctl.c | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f9958f661789..7446d390d051 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2272,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
 					      struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
-static inline int ext4_sb_has_crypto(struct super_block *sb)
-{
-	return ext4_has_feature_encrypt(sb);
-}
-
 static inline bool ext4_encrypted_inode(struct inode *inode)
 {
 	return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2faeb3cdbbb3..560afe08a5ca 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -806,7 +806,7 @@ resizefs_out:
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		handle_t *handle;
 
-		if (!ext4_sb_has_crypto(sb))
+		if (!ext4_has_feature_encrypt(sb))
 			return -EOPNOTSUPP;
 		if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
 			err = mnt_want_write_file(filp);

From ba679017ef0f47d31c50df731441fdd39bf4bf13 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Dec 2016 11:55:51 -0500
Subject: [PATCH 56/86] ext4: disable pwsalt ioctl when encryption disabled by
 config

On a CONFIG_EXT4_FS_ENCRYPTION=n kernel, the ioctls to get and set
encryption policies were disabled but EXT4_IOC_GET_ENCRYPTION_PWSALT was
not.  But there's no good reason to expose the pwsalt ioctl if the
kernel doesn't support encryption.  The pwsalt ioctl was also disabled
pre-4.8 (via ext4_sb_has_crypto() previously returning 0 when encryption
was disabled by config) and seems to have been enabled by mistake when
ext4 encryption was refactored to use fs/crypto/.  So let's disable it
again.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 560afe08a5ca..fcc9510a819c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -191,6 +191,7 @@ journal_err_out:
 	return err;
 }
 
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
 static int uuid_is_zero(__u8 u[16])
 {
 	int	i;
@@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16])
 			return 0;
 	return 1;
 }
+#endif
 
 static int ext4_ioctl_setflags(struct inode *inode,
 			       unsigned int flags)
@@ -802,6 +804,7 @@ resizefs_out:
 #endif
 	}
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
 		int err, err2;
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		handle_t *handle;
@@ -836,6 +839,9 @@ resizefs_out:
 				 sbi->s_es->s_encrypt_pw_salt, 16))
 			return -EFAULT;
 		return 0;
+#else
+		return -EOPNOTSUPP;
+#endif
 	}
 	case EXT4_IOC_GET_ENCRYPTION_POLICY: {
 #ifdef CONFIG_EXT4_FS_ENCRYPTION

From 2dc8d9e19b0d891b0d3675b5ac82be9be3875e36 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Dec 2016 14:43:33 -0500
Subject: [PATCH 57/86] ext4: forbid i_extra_isize not divisible by 4

i_extra_isize not divisible by 4 is problematic for several reasons:

- It causes the in-inode xattr space to be misaligned, but the xattr
  header and entries are not declared __packed to express this
  possibility.  This may cause poor performance or incorrect code
  generation on some platforms.
- When validating the xattr entries we can read past the end of the
  inode if the size available for xattrs is not a multiple of 4.
- It allows the nonsensical i_extra_isize=1, which doesn't even leave
  enough room for i_extra_isize itself.

Therefore, update ext4_iget() to consider i_extra_isize not divisible by
4 to be an error, like the case where i_extra_isize is too large.

This also matches the rule recently added to e2fsck for determining
whether an inode has valid i_extra_isize.

This patch shouldn't have any noticeable effect on
non-corrupted/non-malicious filesystems, since the size of ext4_inode
has always been a multiple of 4.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/inode.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b48ca0392b9c..e3e197898c66 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4572,10 +4572,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
-		    EXT4_INODE_SIZE(inode->i_sb)) {
-			EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
-				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
-				EXT4_INODE_SIZE(inode->i_sb));
+			EXT4_INODE_SIZE(inode->i_sb) ||
+		    (ei->i_extra_isize & 3)) {
+			EXT4_ERROR_INODE(inode,
+					 "bad extra_isize %u (inode size %u)",
+					 ei->i_extra_isize,
+					 EXT4_INODE_SIZE(inode->i_sb));
 			ret = -EFSCORRUPTED;
 			goto bad_inode;
 		}
@@ -4693,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		if (ei->i_extra_isize == 0) {
 			/* The extra space is currently unused. Use it. */
+			BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
 			ei->i_extra_isize = sizeof(struct ext4_inode) -
 					    EXT4_GOOD_OLD_INODE_SIZE;
 		} else {

From 290ab230016f187c3551d8380ea742889276d03a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Dec 2016 14:51:58 -0500
Subject: [PATCH 58/86] ext4: don't read out of bounds when checking for
 in-inode xattrs

With i_extra_isize equal to or close to the available space, it was
possible for us to read past the end of the inode when trying to detect
or validate in-inode xattrs.  Fix this by checking for the needed extra
space first.

This patch shouldn't have any noticeable effect on
non-corrupted/non-malicious filesystems.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/inode.c | 4 +++-
 fs/ext4/xattr.c | 5 ++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e3e197898c66..59a518ad6bb2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4527,7 +4527,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
 {
 	__le32 *magic = (void *)raw_inode +
 			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
-	if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+	if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
+	    EXT4_INODE_SIZE(inode->i_sb) &&
+	    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
 		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 		ext4_find_inline_data_nolock(inode);
 	} else
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1846e9168f80..59c9ec7eabae 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -231,13 +231,12 @@ static int
 __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
 			 void *end, const char *function, unsigned int line)
 {
-	struct ext4_xattr_entry *entry = IFIRST(header);
 	int error = -EFSCORRUPTED;
 
-	if (((void *) header >= end) ||
+	if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
 	    (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
 		goto errout;
-	error = ext4_xattr_check_names(entry, end, entry);
+	error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
 errout:
 	if (error)
 		__ext4_error_inode(inode, function, line, 0,

From d7614cc16146e3f0b4c33e71875c19607602aed5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Dec 2016 14:57:29 -0500
Subject: [PATCH 59/86] ext4: correctly detect when an xattr value has an
 invalid size

It was possible for an xattr value to have a very large size, which
would then pass validation on 32-bit architectures due to a pointer
wraparound.  Fix this by validating the size in a way which avoids
pointer wraparound.

It was also possible that a value's size would fit in the available
space but its padded size would not.  This would cause an out-of-bounds
memory write in ext4_xattr_set_entry when replacing the xattr value.
For example, if an xattr value of unpadded size 253 bytes went until the
very end of the inode or block, then using setxattr(2) to replace this
xattr's value with 256 bytes would cause a write to the 3 bytes past the
end of the inode or buffer, and the new xattr value would be incorrectly
truncated.  Fix this by requiring that the padded size fit in the
available space rather than the unpadded size.

This patch shouldn't have any noticeable effect on
non-corrupted/non-malicious filesystems.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 59c9ec7eabae..5a94fa52b74f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
 {
 	struct ext4_xattr_entry *e = entry;
 
+	/* Find the end of the names list */
 	while (!IS_LAST_ENTRY(e)) {
 		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
 		if ((void *)next >= end)
@@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
 		e = next;
 	}
 
+	/* Check the values */
 	while (!IS_LAST_ENTRY(entry)) {
 		if (entry->e_value_block != 0)
 			return -EFSCORRUPTED;
-		if (entry->e_value_size != 0 &&
-		    (value_start + le16_to_cpu(entry->e_value_offs) <
-		     (void *)e + sizeof(__u32) ||
-		     value_start + le16_to_cpu(entry->e_value_offs) +
-		    le32_to_cpu(entry->e_value_size) > end))
-			return -EFSCORRUPTED;
+		if (entry->e_value_size != 0) {
+			u16 offs = le16_to_cpu(entry->e_value_offs);
+			u32 size = le32_to_cpu(entry->e_value_size);
+			void *value;
+
+			/*
+			 * The value cannot overlap the names, and the value
+			 * with padding cannot extend beyond 'end'.  Check both
+			 * the padded and unpadded sizes, since the size may
+			 * overflow to 0 when adding padding.
+			 */
+			if (offs > end - value_start)
+				return -EFSCORRUPTED;
+			value = value_start + offs;
+			if (value < (void *)e + sizeof(u32) ||
+			    size > end - value ||
+			    EXT4_XATTR_SIZE(size) > end - value)
+				return -EFSCORRUPTED;
+		}
 		entry = EXT4_XATTR_NEXT(entry);
 	}
 

From 3a4b77cd47bb837b8557595ec7425f281f2ca1fe Mon Sep 17 00:00:00 2001
From: Eryu Guan <guaneryu@gmail.com>
Date: Thu, 1 Dec 2016 15:08:37 -0500
Subject: [PATCH 60/86] ext4: validate s_first_meta_bg at mount time

Ralf Spenneberg reported that he hit a kernel crash when mounting a
modified ext4 image. And it turns out that kernel crashed when
calculating fs overhead (ext4_calculate_overhead()), this is because
the image has very large s_first_meta_bg (debug code shows it's
842150400), and ext4 overruns the memory in count_overhead() when
setting bitmap buffer, which is PAGE_SIZE.

ext4_calculate_overhead():
  buf = get_zeroed_page(GFP_NOFS);  <=== PAGE_SIZE buffer
  blks = count_overhead(sb, i, buf);

count_overhead():
  for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { <=== j = 842150400
          ext4_set_bit(EXT4_B2C(sbi, s++), buf);   <=== buffer overrun
          count++;
  }

This can be reproduced easily for me by this script:

  #!/bin/bash
  rm -f fs.img
  mkdir -p /mnt/ext4
  fallocate -l 16M fs.img
  mke2fs -t ext4 -O bigalloc,meta_bg,^resize_inode -F fs.img
  debugfs -w -R "ssv first_meta_bg 842150400" fs.img
  mount -o loop fs.img /mnt/ext4

Fix it by validating s_first_meta_bg first at mount time, and
refusing to mount if its value exceeds the largest possible meta_bg
number.

Reported-by: Ralf Spenneberg <ralf@os-t.de>
Signed-off-by: Eryu Guan <guaneryu@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/super.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a526956e49e7..32c0debbaa92 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3842,6 +3842,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
+	if (ext4_has_feature_meta_bg(sb)) {
+		if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+			ext4_msg(sb, KERN_WARNING,
+				 "first meta block group too large: %u "
+				 "(group descriptor block count %u)",
+				 le32_to_cpu(es->s_first_meta_bg), db_count);
+			goto failed_mount;
+		}
+	}
 	sbi->s_group_desc = ext4_kvmalloc(db_count *
 					  sizeof(struct buffer_head *),
 					  GFP_KERNEL);

From 4db0d88e2ebc4f47092adc01f9885a43ad748995 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 2 Dec 2016 12:12:53 -0500
Subject: [PATCH 61/86] ext4: fix reading new encrypted symlinks on no-journal
 file systems

On a filesystem with no journal, a symlink longer than about 32
characters (exact length depending on padding for encryption) could not
be followed or read immediately after being created in an encrypted
directory.  This happened because when the symlink data went through the
delayed allocation path instead of the journaling path, the symlink was
incorrectly detected as a "fast" symlink rather than a "slow" symlink
until its data was written out.

To fix this, disable delayed allocation for symlinks, since there is
no benefit for delayed allocation anyway.

Reported-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59a518ad6bb2..a1eac0054203 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2902,7 +2902,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 
 	index = pos >> PAGE_SHIFT;
 
-	if (ext4_nonda_switch(inode->i_sb)) {
+	if (ext4_nonda_switch(inode->i_sb) ||
+	    S_ISLNK(inode->i_mode)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
 		return ext4_write_begin(file, mapping, pos,
 					len, flags, pagep, fsdata);

From 918b7306edacbcc8a9ca318a5a34d73954e1705d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Dec 2016 15:13:15 -0500
Subject: [PATCH 62/86] mbcache: correctly handle 'e_referenced' bit

mbcache entries have an 'e_referenced' bit which users can set with
mb_cache_entry_touch() to indicate that an entry should be given another
pass through the LRU list before the shrinker can delete it.  However,
mb_cache_shrink() actually would, when seeing an e_referenced entry at
the front of the list (the least-recently used end), place it right at
the front of the list again.  The next iteration would then remove the
entry from the list and delete it.  Consequently, e_referenced had
essentially no effect, so ext2/ext4 xattr blocks would sometimes not be
reused as often as expected.

Fix this by making the shrinker move e_referenced entries to the back of
the list rather than the front.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index c5bd19ffa326..31e54c25d16f 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 					 struct mb_cache_entry, e_list);
 		if (entry->e_referenced) {
 			entry->e_referenced = 0;
-			list_move_tail(&cache->c_list, &entry->e_list);
+			list_move_tail(&entry->e_list, &cache->c_list);
 			continue;
 		}
 		list_del_init(&entry->e_list);

From 21d0f4fa8e7b026cdae988f99f4c3aab154d5b7c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Dec 2016 15:28:53 -0500
Subject: [PATCH 63/86] mbcache: don't BUG() if entry cache cannot be allocated

mbcache can be a module that is loaded long after startup, when someone
asks to mount an ext2 or ext4 filesystem.  Therefore it should not BUG()
if kmem_cache_create() fails, but rather just fail the module load.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 31e54c25d16f..c56ab2164d6b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -420,7 +420,8 @@ static int __init mbcache_init(void)
 	mb_entry_cache = kmem_cache_create("mbcache",
 				sizeof(struct mb_cache_entry), 0,
 				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
-	BUG_ON(!mb_entry_cache);
+	if (!mb_entry_cache)
+		return -ENOMEM;
 	return 0;
 }
 

From 97c7b18a5d15516a4358a6d2d5ddfb22d62d9fac Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Dec 2016 15:38:29 -0500
Subject: [PATCH 64/86] mbcache: remove unnecessary module_get/module_put

When mbcache is built as a module, any modules that use it (ext2 and/or
ext4) will depend on its symbols directly, incrementing its reference
count.  Therefore, there is no need to do module_get/module_put.

Also note that since the module_get/module_put were in the mbcache
module itself, executing those lines of code was already dependent on
another reference to the mbcache module being held.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index c56ab2164d6b..07c5d7d2b701 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -344,9 +344,6 @@ struct mb_cache *mb_cache_create(int bucket_bits)
 	int bucket_count = 1 << bucket_bits;
 	int i;
 
-	if (!try_module_get(THIS_MODULE))
-		return NULL;
-
 	cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
 	if (!cache)
 		goto err_out;
@@ -377,7 +374,6 @@ struct mb_cache *mb_cache_create(int bucket_bits)
 	return cache;
 
 err_out:
-	module_put(THIS_MODULE);
 	return NULL;
 }
 EXPORT_SYMBOL(mb_cache_create);
@@ -411,7 +407,6 @@ void mb_cache_destroy(struct mb_cache *cache)
 	}
 	kfree(cache->c_hash);
 	kfree(cache);
-	module_put(THIS_MODULE);
 }
 EXPORT_SYMBOL(mb_cache_destroy);
 

From 132d4e2d55dfd9edae26cb35f088add7c19990e6 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Dec 2016 15:43:48 -0500
Subject: [PATCH 65/86] mbcache: use consistent type for entry count

mbcache used several different types to represent the number of entries
in the cache.  For consistency within mbcache and with the shrinker API,
always use unsigned long.

This does not change behavior for current mbcache users (ext2 and ext4)
since they limit the entry count to a value which easily fits in an int.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 07c5d7d2b701..bf659061a9a8 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -29,7 +29,7 @@ struct mb_cache {
 	/* log2 of hash table size */
 	int			c_bucket_bits;
 	/* Maximum entries in cache to avoid degrading hash too much */
-	int			c_max_entries;
+	unsigned long		c_max_entries;
 	/* Protects c_list, c_entry_count */
 	spinlock_t		c_list_lock;
 	struct list_head	c_list;
@@ -43,7 +43,7 @@ struct mb_cache {
 static struct kmem_cache *mb_entry_cache;
 
 static unsigned long mb_cache_shrink(struct mb_cache *cache,
-				     unsigned int nr_to_scan);
+				     unsigned long nr_to_scan);
 
 static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
 							u32 key)
@@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
 
 /* Shrink number of entries in cache */
 static unsigned long mb_cache_shrink(struct mb_cache *cache,
-				     unsigned int nr_to_scan)
+				     unsigned long nr_to_scan)
 {
 	struct mb_cache_entry *entry;
 	struct hlist_bl_head *head;
-	unsigned int shrunk = 0;
+	unsigned long shrunk = 0;
 
 	spin_lock(&cache->c_list_lock);
 	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
@@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 static unsigned long mb_cache_scan(struct shrinker *shrink,
 				   struct shrink_control *sc)
 {
-	int nr_to_scan = sc->nr_to_scan;
 	struct mb_cache *cache = container_of(shrink, struct mb_cache,
 					      c_shrink);
-	return mb_cache_shrink(cache, nr_to_scan);
+	return mb_cache_shrink(cache, sc->nr_to_scan);
 }
 
 /* We shrink 1/X of the cache when we have too many entries in it */
@@ -341,8 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work)
 struct mb_cache *mb_cache_create(int bucket_bits)
 {
 	struct mb_cache *cache;
-	int bucket_count = 1 << bucket_bits;
-	int i;
+	unsigned long bucket_count = 1UL << bucket_bits;
+	unsigned long i;
 
 	cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
 	if (!cache)

From b649668c0bb3f54523cd932490a3f637859fdeb0 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 3 Dec 2016 15:55:01 -0500
Subject: [PATCH 66/86] mbcache: document that "find" functions only return
 reusable entries

mb_cache_entry_find_first() and mb_cache_entry_find_next() only return
cache entries with the 'e_reusable' bit set.  This should be documented.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index bf659061a9a8..b19be429d655 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -155,12 +155,12 @@ out:
 }
 
 /*
- * mb_cache_entry_find_first - find the first entry in cache with given key
+ * mb_cache_entry_find_first - find the first reusable entry with the given key
  * @cache: cache where we should search
  * @key: key to look for
  *
- * Search in @cache for entry with key @key. Grabs reference to the first
- * entry found and returns the entry.
+ * Search in @cache for a reusable entry with key @key. Grabs reference to the
+ * first reusable entry found and returns the entry.
  */
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 						 u32 key)
@@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 EXPORT_SYMBOL(mb_cache_entry_find_first);
 
 /*
- * mb_cache_entry_find_next - find next entry in cache with the same
+ * mb_cache_entry_find_next - find next reusable entry with the same key
  * @cache: cache where we should search
  * @entry: entry to start search from
  *
- * Finds next entry in the hash chain which has the same key as @entry.
- * If @entry is unhashed (which can happen when deletion of entry races
- * with the search), finds the first entry in the hash chain. The function
- * drops reference to @entry and returns with a reference to the found entry.
+ * Finds next reusable entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races with the
+ * search), finds the first reusable entry in the hash chain. The function drops
+ * reference to @entry and returns with a reference to the found entry.
  */
 struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
 						struct mb_cache_entry *entry)

From ab04df78181b271dca096a8050877469889e3c8c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 3 Dec 2016 16:20:53 -0500
Subject: [PATCH 67/86] ext4: fix checks for data=ordered and
 journal_async_commit options

Combination of data=ordered mode and journal_async_commit mount option
is invalid. However the check in parse_options() fails to detect the
case where we simply end up defaulting to data=ordered mode and we
detect the problem only on remount which triggers hard to understand
failure to remount the filesystem.

Fix the checking of mount options to take into account also the default
mode by moving the check somewhat later in the mount sequence.

Reported-by: Wolfgang Walter <linux@stwm.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32c0debbaa92..f185b9a5a024 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1901,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb,
 			return 0;
 		}
 	}
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
-	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-		ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
-			 "in data=ordered mode");
-		return 0;
-	}
 	return 1;
 }
 
@@ -4004,6 +3998,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	default:
 		break;
 	}
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+		ext4_msg(sb, KERN_ERR, "can't mount with "
+			"journal_async_commit in data=ordered mode");
+		goto failed_mount_wq;
+	}
+
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
@@ -4897,6 +4899,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			err = -EINVAL;
 			goto restore_opts;
 		}
+	} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
+		if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+			ext4_msg(sb, KERN_ERR, "can't mount with "
+				"journal_async_commit in data=ordered mode");
+			err = -EINVAL;
+			goto restore_opts;
+		}
 	}
 
 	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {

From 9e645ab72e6f73873a00d042d0ecc9c148d5a6b6 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Sat, 3 Dec 2016 16:44:45 -0500
Subject: [PATCH 68/86] Documentation: fix description of ext4's block_validity
 mount option

Fix ext4 documentation according to commit 45f1a9c3f63d
("ext4: enable block_validity by default")

Also fix some typos.

[ Further documentation cleanups by tytso ]

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 6c0108eb0137..3698ed3146e3 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -351,14 +351,13 @@ nouid32			Disables 32-bit UIDs and GIDs.  This is for
 			interoperability  with  older kernels which only
 			store and expect 16-bit values.
 
-block_validity		This options allows to enables/disables the in-kernel
+block_validity(*)	These options enable or disable the in-kernel
 noblock_validity	facility for tracking filesystem metadata blocks
-			within internal data structures. This allows multi-
-			block allocator and other routines to quickly locate
-			extents which might overlap with filesystem metadata
-			blocks. This option is intended for debugging
-			purposes and since it negatively affects the
-			performance, it is off by default.
+			within internal data structures.  This allows multi-
+			block allocator and other routines to notice
+			bugs or corrupted allocation bitmaps which cause
+			blocks to be allocated which overlap with
+			filesystem metadata blocks.
 
 dioread_lock		Controls whether or not ext4 should use the DIO read
 dioread_nolock		locking. If the dioread_nolock option is specified

From 011c88e36c26a08590b2ebe214ac854289f85f26 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 3 Dec 2016 16:46:58 -0500
Subject: [PATCH 69/86] ext4: remove another test in ext4_alloc_file_blocks()

Before commit c3fe493ccdb1 ('ext4: remove unneeded test in
ext4_alloc_file_blocks()') then it was possible for "depth" to be -1
but now, it's not possible that it is negative.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 65dbd2bcf775..3e1014fe835e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4700,7 +4700,7 @@ retry:
 		/*
 		 * Recalculate credits when extent tree depth changes.
 		 */
-		if (depth >= 0 && depth != ext_depth(inode)) {
+		if (depth != ext_depth(inode)) {
 			credits = ext4_chunk_trans_blocks(inode, len);
 			depth = ext_depth(inode);
 		}

From 7e6e1ef48fc02f3ac5d0edecbb0c6087cd758d58 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 10 Dec 2016 09:55:01 -0500
Subject: [PATCH 70/86] ext4: reject inodes with negative size

Don't load an inode with a negative size; this causes integer overflow
problems in the VFS.

[ Added EXT4_ERROR_INODE() to mark file system as corrupted. -TYT]

Fixes: a48380f769df (ext4: rename i_dir_acl to i_size_high)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/inode.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a1eac0054203..f9f892212308 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4553,6 +4553,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	struct inode *inode;
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 	long ret;
+	loff_t size;
 	int block;
 	uid_t i_uid;
 	gid_t i_gid;
@@ -4655,6 +4656,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
 	inode->i_size = ext4_isize(raw_inode);
+	if ((size = i_size_read(inode)) < 0) {
+		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+		ret = -EFSCORRUPTED;
+		goto bad_inode;
+	}
 	ei->i_disksize = inode->i_size;
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;

From 578620f451f836389424833f1454eeeb2ffc9e9f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 10 Dec 2016 09:56:01 -0500
Subject: [PATCH 71/86] ext4: return -ENOMEM instead of success

We should set the error code if kzalloc() fails.

Fixes: 67cf5b09a46f ("ext4: add the basic function for inline data support")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/inline.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9b67f75bdcf7..437df6a1a841 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -341,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
 
 	len -= EXT4_MIN_INLINE_DATA_SIZE;
 	value = kzalloc(len, GFP_NOFS);
-	if (!value)
+	if (!value) {
+		error = -ENOMEM;
 		goto out;
+	}
 
 	error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
 				     value, len);

From 73b92a2a5e97d17cc4d5c4fe9d724d3273fb6fd2 Mon Sep 17 00:00:00 2001
From: Sergey Karamov <skaramov@google.com>
Date: Sat, 10 Dec 2016 17:54:58 -0500
Subject: [PATCH 72/86] ext4: do not perform data journaling when data is
 encrypted

Currently data journalling is incompatible with encryption: enabling both
at the same time has never been supported by design, and would result in
unpredictable behavior. However, users are not precluded from turning on
both features simultaneously. This change programmatically replaces data
journaling for encrypted regular files with ordered data journaling mode.

Background:
Journaling encrypted data has not been supported because it operates on
buffer heads of the page in the page cache. Namely, when the commit
happens, which could be up to five seconds after caching, the commit
thread uses the buffer heads attached to the page to copy the contents of
the page to the journal. With encryption, it would have been required to
keep the bounce buffer with ciphertext for up to the aforementioned five
seconds, since the page cache can only hold plaintext and could not be
used for journaling. Alternatively, it would be required to setup the
journal to initiate a callback at the commit time to perform deferred
encryption - in this case, not only would the data have to be written
twice, but it would also have to be encrypted twice. This level of
complexity was not justified for a mode that in practice is very rarely
used because of the overhead from the data journalling.

Solution:
If data=journaled has been set as a mount option for a filesystem, or if
journaling is enabled on a regular file, do not perform journaling if the
file is also encrypted, instead fall back to the data=ordered mode for the
file.

Rationale:
The intent is to allow seamless and proper filesystem operation when
journaling and encryption have both been enabled, and have these two
conflicting features gracefully resolved by the filesystem.

Fixes: 4461471107b7
Signed-off-by: Sergey Karamov <skaramov@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/ext4_jbd2.h | 14 ++++++++------
 fs/ext4/super.c     |  5 +++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b1d52c14098e..f97611171023 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
 		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
 	/* We do not support data journalling with delayed allocation */
 	if (!S_ISREG(inode->i_mode) ||
-	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
-	    !test_opt(inode->i_sb, DELALLOC))
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))) {
+		/* We do not support data journalling for encrypted data */
+		if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
 		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	}
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
 		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
-	else
-		BUG();
+	BUG();
 }
 
 static inline int ext4_should_journal_data(struct inode *inode)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f185b9a5a024..79af71d4fccd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3530,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 "both data=journal and dax");
 			goto failed_mount;
 		}
+		if (ext4_has_feature_encrypt(sb)) {
+			ext4_msg(sb, KERN_WARNING,
+				 "encrypted files will use data=ordered "
+				 "instead of data journaling mode");
+		}
 		if (test_opt(sb, DELALLOC))
 			clear_opt(sb, DELALLOC);
 	} else {

From c1b2212990e1567dc4a4d130e89b0f946e1c38f5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 26 Nov 2016 14:33:40 -0500
Subject: [PATCH 73/86] MAINTAINERS: fscrypto: recommend linux-fsdevel for
 fscrypto patches

The filesystem level encryption support, currently used by ext4 and f2fs
and proposed for ubifs, does not yet have a dedicated mailing list.
Since no mailing lists were specified in MAINTAINERS, get_maintainer.pl
only recommended to send patches directly to the maintainers and to
linux-kernel.  This patch adds linux-fsdevel as the preferred mailing
list for fscrypto patches for the time being.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 411e3b87b8c2..0774714c4e67 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5138,6 +5138,7 @@ F:	include/linux/fscache*.h
 FS-CRYPTO: FILE SYSTEM LEVEL ENCRYPTION SUPPORT
 M:	Theodore Y. Ts'o <tytso@mit.edu>
 M:	Jaegeuk Kim <jaegeuk@kernel.org>
+L:	linux-fsdevel@vger.kernel.org
 S:	Supported
 F:	fs/crypto/
 F:	include/linux/fscrypto.h

From 8048123576d1011e186b1f62478bcd06ac7cabc8 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 26 Nov 2016 15:06:43 -0500
Subject: [PATCH 74/86] fscrypto: remove unneeded Kconfig dependencies

SHA256 and ENCRYPTED_KEYS are not needed.  CTR shouldn't be needed
either, but I left it for now because it was intentionally added by
commit 71dea01ea2ed ("ext4 crypto: require CONFIG_CRYPTO_CTR if ext4
encryption is enabled").  So it sounds like there may be a dependency
problem elsewhere, which I have not been able to identify specifically,
that must be solved before CTR can be removed.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 92348faf9865..f514978f6688 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -8,9 +8,7 @@ config FS_ENCRYPTION
 	select CRYPTO_XTS
 	select CRYPTO_CTS
 	select CRYPTO_CTR
-	select CRYPTO_SHA256
 	select KEYS
-	select ENCRYPTED_KEYS
 	help
 	  Enable encryption of files and directories.  This
 	  feature is similar to ecryptfs, but it is more memory

From db717d8e26c2d1b0dba3e08668a1e6a7f665adde Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 26 Nov 2016 19:07:49 -0500
Subject: [PATCH 75/86] fscrypto: move ioctl processing more fully into common
 code

Multiple bugs were recently fixed in the "set encryption policy" ioctl.
To make it clear that fscrypt_process_policy() and fscrypt_get_policy()
implement ioctls and therefore their implementations must take standard
security and correctness precautions, rename them to
fscrypt_ioctl_set_policy() and fscrypt_ioctl_get_policy().  Make the
latter take in a struct file * to make it consistent with the former.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/policy.c       | 34 +++++++++++++++++++++-------------
 fs/ext4/ext4.h           |  4 ++--
 fs/ext4/ioctl.c          | 32 ++++----------------------------
 fs/f2fs/f2fs.h           |  4 ++--
 fs/f2fs/file.c           | 19 ++-----------------
 include/linux/fscrypto.h | 12 ++++++------
 6 files changed, 37 insertions(+), 68 deletions(-)

diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6865663aac69..b96a10e3cf78 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode,
 	return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
 }
 
-int fscrypt_process_policy(struct file *filp,
-				const struct fscrypt_policy *policy)
+int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 {
+	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
 	int ret;
 
+	if (copy_from_user(&policy, arg, sizeof(policy)))
+		return -EFAULT;
+
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (policy->version != 0)
+	if (policy.version != 0)
 		return -EINVAL;
 
 	ret = mnt_want_write_file(filp);
@@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp,
 			ret = -ENOTEMPTY;
 		else
 			ret = create_encryption_context_from_policy(inode,
-								    policy);
+								    &policy);
 	} else if (!is_encryption_context_consistent_with_policy(inode,
-								 policy)) {
+								 &policy)) {
 		printk(KERN_WARNING
 		       "%s: Policy inconsistent with encryption context\n",
 		       __func__);
@@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp,
 	mnt_drop_write_file(filp);
 	return ret;
 }
-EXPORT_SYMBOL(fscrypt_process_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
 
-int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
 {
+	struct inode *inode = file_inode(filp);
 	struct fscrypt_context ctx;
+	struct fscrypt_policy policy;
 	int res;
 
 	if (!inode->i_sb->s_cop->get_context ||
@@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
 	if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
 		return -EINVAL;
 
-	policy->version = 0;
-	policy->contents_encryption_mode = ctx.contents_encryption_mode;
-	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
-	policy->flags = ctx.flags;
-	memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+	policy.version = 0;
+	policy.contents_encryption_mode = ctx.contents_encryption_mode;
+	policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
+	policy.flags = ctx.flags;
+	memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
 				FS_KEY_DESCRIPTOR_SIZE);
+
+	if (copy_to_user(arg, &policy, sizeof(policy)))
+		return -EFAULT;
 	return 0;
 }
-EXPORT_SYMBOL(fscrypt_get_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
 
 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 282a51b07c57..bd8bc3be93a5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2338,8 +2338,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
 #define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
 #define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
 #define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy		fscrypt_notsupp_process_policy
-#define fscrypt_get_policy		fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
 #define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
 #define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
 #define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae8ebbc97..70083863ce51 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -765,22 +765,12 @@ resizefs_out:
 	}
 	case EXT4_IOC_PRECACHE_EXTENTS:
 		return ext4_ext_precache(inode);
-	case EXT4_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-		struct fscrypt_policy policy;
 
+	case EXT4_IOC_SET_ENCRYPTION_POLICY:
 		if (!ext4_has_feature_encrypt(sb))
 			return -EOPNOTSUPP;
+		return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
 
-		if (copy_from_user(&policy,
-				   (struct fscrypt_policy __user *)arg,
-				   sizeof(policy)))
-			return -EFAULT;
-		return fscrypt_process_policy(filp, &policy);
-#else
-		return -EOPNOTSUPP;
-#endif
-	}
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
 		int err, err2;
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -817,23 +807,9 @@ resizefs_out:
 			return -EFAULT;
 		return 0;
 	}
-	case EXT4_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-		struct fscrypt_policy policy;
-		int err = 0;
+	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+		return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 
-		if (!ext4_encrypted_inode(inode))
-			return -ENOENT;
-		err = fscrypt_get_policy(inode, &policy);
-		if (err)
-			return err;
-		if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
-			return -EFAULT;
-		return 0;
-#else
-		return -EOPNOTSUPP;
-#endif
-	}
 	case EXT4_IOC_FSGETXATTR:
 	{
 		struct fsxattr fa;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9e8de18a168a..8e94b7bda42b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2453,8 +2453,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
 #define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
 #define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
 #define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy		fscrypt_notsupp_process_policy
-#define fscrypt_get_policy		fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
 #define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
 #define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
 #define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c7865073cd26..f0c83f74557d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1752,31 +1752,16 @@ static bool uuid_is_nonzero(__u8 u[16])
 
 static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
-	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
 
-	if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
-							sizeof(policy)))
-		return -EFAULT;
-
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 
-	return fscrypt_process_policy(filp, &policy);
+	return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
 }
 
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
 {
-	struct fscrypt_policy policy;
-	struct inode *inode = file_inode(filp);
-	int err;
-
-	err = fscrypt_get_policy(inode, &policy);
-	if (err)
-		return err;
-
-	if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
-		return -EFAULT;
-	return 0;
+	return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 }
 
 static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 98c71e973a96..be94684dc05f 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -259,8 +259,8 @@ extern void fscrypt_restore_control_page(struct page *);
 extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
 						unsigned int);
 /* policy.c */
-extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *);
-extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *);
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
 extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
 extern int fscrypt_inherit_context(struct inode *, struct inode *,
 					void *, bool);
@@ -334,14 +334,14 @@ static inline int fscrypt_notsupp_zeroout_range(const struct inode *i, pgoff_t p
 }
 
 /* policy.c */
-static inline int fscrypt_notsupp_process_policy(struct file *f,
-				const struct fscrypt_policy *p)
+static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f,
+				const void __user *arg)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int fscrypt_notsupp_get_policy(struct inode *i,
-				struct fscrypt_policy *p)
+static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f,
+				void __user *arg)
 {
 	return -EOPNOTSUPP;
 }

From 3325bea5b26ac67e2521383f10e5ea0156c9a4b6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 26 Nov 2016 20:32:46 -0500
Subject: [PATCH 76/86] fscrypt: rename get_crypt_info() to
 fscrypt_get_crypt_info()

To avoid namespace collisions, rename get_crypt_info() to
fscrypt_get_crypt_info().  The function is only used inside the
fs/crypto directory, so declare it in the new header file,
fscrypt_private.h.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/fname.c           |  4 ++--
 fs/crypto/fscrypt_private.h | 19 +++++++++++++++++++
 fs/crypto/keyinfo.c         |  6 +++---
 include/linux/fscrypto.h    |  1 -
 4 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 fs/crypto/fscrypt_private.h

diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6b45d9caeeb0..56ad9d195f18 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -12,7 +12,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/ratelimit.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
 
 /**
  * fname_crypt_complete() - completion callback for filename crypto
@@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
 		fname->disk_name.len = iname->len;
 		return 0;
 	}
-	ret = get_crypt_info(dir);
+	ret = fscrypt_get_crypt_info(dir);
 	if (ret && ret != -EOPNOTSUPP)
 		return ret;
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
new file mode 100644
index 000000000000..7c31108728e4
--- /dev/null
+++ b/fs/crypto/fscrypt_private.h
@@ -0,0 +1,19 @@
+/*
+ * fscrypt_private.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#ifndef _FSCRYPT_PRIVATE_H
+#define _FSCRYPT_PRIVATE_H
+
+#include <linux/fscrypto.h>
+
+/* keyinfo.c */
+extern int fscrypt_get_crypt_info(struct inode *);
+
+#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 67fb6d8876d0..35d3317a27b3 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,7 +10,7 @@
 
 #include <keys/user-type.h>
 #include <linux/scatterlist.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
 
 static void derive_crypt_complete(struct crypto_async_request *req, int rc)
 {
@@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
 	kmem_cache_free(fscrypt_info_cachep, ci);
 }
 
-int get_crypt_info(struct inode *inode)
+int fscrypt_get_crypt_info(struct inode *inode)
 {
 	struct fscrypt_info *crypt_info;
 	struct fscrypt_context ctx;
@@ -327,7 +327,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
 		 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
 					       (1 << KEY_FLAG_REVOKED) |
 					       (1 << KEY_FLAG_DEAD)))))
-		return get_crypt_info(inode);
+		return fscrypt_get_crypt_info(inode);
 	return 0;
 }
 EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index be94684dc05f..2f8894f0696c 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -265,7 +265,6 @@ extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
 extern int fscrypt_inherit_context(struct inode *, struct inode *,
 					void *, bool);
 /* keyinfo.c */
-extern int get_crypt_info(struct inode *);
 extern int fscrypt_get_encryption_info(struct inode *);
 extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
 

From b98701df349b7003efd52d9330acbb7be5a255c6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 26 Nov 2016 20:43:09 -0500
Subject: [PATCH 77/86] fscrypt: unexport fscrypt_initialize()

The fscrypt_initalize() function isn't used outside fs/crypto, so
there's no point making it be an exported symbol.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/crypto.c          | 1 -
 fs/crypto/fscrypt_private.h | 3 +++
 include/linux/fscrypto.h    | 1 -
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index b6029785714c..56f98f45cece 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -540,7 +540,6 @@ fail:
 	mutex_unlock(&fscrypt_init_mutex);
 	return res;
 }
-EXPORT_SYMBOL(fscrypt_initialize);
 
 /**
  * fscrypt_init() - Set up for fs encryption.
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7c31108728e4..bb92f0c0961b 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -13,6 +13,9 @@
 
 #include <linux/fscrypto.h>
 
+/* crypto.c */
+int fscrypt_initialize(void);
+
 /* keyinfo.c */
 extern int fscrypt_get_crypt_info(struct inode *);
 
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 2f8894f0696c..ce2ebdee6a89 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -244,7 +244,6 @@ static inline void fscrypt_set_d_op(struct dentry *dentry)
 #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
 /* crypto.c */
 extern struct kmem_cache *fscrypt_info_cachep;
-int fscrypt_initialize(void);
 
 extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);

From cc4e0df038ddb73510c01712abf302b3f0130147 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 26 Nov 2016 22:05:18 -0500
Subject: [PATCH 78/86] fscrypt: move non-public structures and constants to
 fscrypt_private.h

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/crypto.c          |  2 +-
 fs/crypto/fscrypt_private.h | 71 +++++++++++++++++++++++++++++++++++++
 fs/crypto/policy.c          |  2 +-
 include/linux/fscrypto.h    | 68 ++---------------------------------
 4 files changed, 76 insertions(+), 67 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 56f98f45cece..4d9d221b1d60 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -27,7 +27,7 @@
 #include <linux/bio.h>
 #include <linux/dcache.h>
 #include <linux/namei.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
 
 static unsigned int num_prealloc_crypto_pages = 32;
 static unsigned int num_prealloc_crypto_ctxs = 128;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index bb92f0c0961b..c98b2a7fb6d3 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -13,6 +13,77 @@
 
 #include <linux/fscrypto.h>
 
+#define FS_FNAME_CRYPTO_DIGEST_SIZE	32
+
+/* Encryption parameters */
+#define FS_XTS_TWEAK_SIZE		16
+#define FS_AES_128_ECB_KEY_SIZE		16
+#define FS_AES_256_GCM_KEY_SIZE		32
+#define FS_AES_256_CBC_KEY_SIZE		32
+#define FS_AES_256_CTS_KEY_SIZE		32
+#define FS_AES_256_XTS_KEY_SIZE		64
+#define FS_MAX_KEY_SIZE			64
+
+#define FS_KEY_DESC_PREFIX		"fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE		8
+
+#define FS_KEY_DERIVATION_NONCE_SIZE		16
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ *  1 byte: Protector format (1 = this version)
+ *  1 byte: File contents encryption mode
+ *  1 byte: File names encryption mode
+ *  1 byte: Flags
+ *  8 bytes: Master Key descriptor
+ *  16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+	u8 format;
+	u8 contents_encryption_mode;
+	u8 filenames_encryption_mode;
+	u8 flags;
+	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;
+
+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1		1
+
+/* This is passed in from userspace into the kernel keyring */
+struct fscrypt_key {
+	u32 mode;
+	u8 raw[FS_MAX_KEY_SIZE];
+	u32 size;
+} __packed;
+
+/*
+ * A pointer to this structure is stored in the file system's in-core
+ * representation of an inode.
+ */
+struct fscrypt_info {
+	u8 ci_data_mode;
+	u8 ci_filename_mode;
+	u8 ci_flags;
+	struct crypto_skcipher *ci_ctfm;
+	struct key *ci_keyring_key;
+	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
+#define FS_WRITE_PATH_FL			0x00000002
+
+struct fscrypt_completion_result {
+	struct completion completion;
+	int res;
+};
+
+#define DECLARE_FS_COMPLETION_RESULT(ecr) \
+	struct fscrypt_completion_result ecr = { \
+		COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+
 /* crypto.c */
 int fscrypt_initialize(void);
 
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index b96a10e3cf78..6ed7c2eebeec 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,8 +10,8 @@
 
 #include <linux/random.h>
 #include <linux/string.h>
-#include <linux/fscrypto.h>
 #include <linux/mount.h>
+#include "fscrypt_private.h"
 
 static int inode_has_encryption_context(struct inode *inode)
 {
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index ce2ebdee6a89..71e8a20711ec 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -18,9 +18,6 @@
 #include <crypto/skcipher.h>
 #include <uapi/linux/fs.h>
 
-#define FS_KEY_DERIVATION_NONCE_SIZE		16
-#define FS_ENCRYPTION_CONTEXT_FORMAT_V1		1
-
 #define FS_POLICY_FLAGS_PAD_4		0x00
 #define FS_POLICY_FLAGS_PAD_8		0x01
 #define FS_POLICY_FLAGS_PAD_16		0x02
@@ -35,56 +32,10 @@
 #define FS_ENCRYPTION_MODE_AES_256_CBC		3
 #define FS_ENCRYPTION_MODE_AES_256_CTS		4
 
-/**
- * Encryption context for inode
- *
- * Protector format:
- *  1 byte: Protector format (1 = this version)
- *  1 byte: File contents encryption mode
- *  1 byte: File names encryption mode
- *  1 byte: Flags
- *  8 bytes: Master Key descriptor
- *  16 bytes: Encryption Key derivation nonce
- */
-struct fscrypt_context {
-	u8 format;
-	u8 contents_encryption_mode;
-	u8 filenames_encryption_mode;
-	u8 flags;
-	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
-	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
-} __packed;
+#define FS_CRYPTO_BLOCK_SIZE		16
 
-/* Encryption parameters */
-#define FS_XTS_TWEAK_SIZE		16
-#define FS_AES_128_ECB_KEY_SIZE		16
-#define FS_AES_256_GCM_KEY_SIZE		32
-#define FS_AES_256_CBC_KEY_SIZE		32
-#define FS_AES_256_CTS_KEY_SIZE		32
-#define FS_AES_256_XTS_KEY_SIZE		64
-#define FS_MAX_KEY_SIZE			64
-
-#define FS_KEY_DESC_PREFIX		"fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE		8
-
-/* This is passed in from userspace into the kernel keyring */
-struct fscrypt_key {
-	u32 mode;
-	u8 raw[FS_MAX_KEY_SIZE];
-	u32 size;
-} __packed;
-
-struct fscrypt_info {
-	u8 ci_data_mode;
-	u8 ci_filename_mode;
-	u8 ci_flags;
-	struct crypto_skcipher *ci_ctfm;
-	struct key *ci_keyring_key;
-	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
-#define FS_WRITE_PATH_FL			0x00000002
+struct fscrypt_info;
+struct fscrypt_ctx;
 
 struct fscrypt_ctx {
 	union {
@@ -102,19 +53,6 @@ struct fscrypt_ctx {
 	u8 mode;				/* Encryption mode for tfm */
 };
 
-struct fscrypt_completion_result {
-	struct completion completion;
-	int res;
-};
-
-#define DECLARE_FS_COMPLETION_RESULT(ecr) \
-	struct fscrypt_completion_result ecr = { \
-		COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-#define FS_FNAME_NUM_SCATTER_ENTRIES	4
-#define FS_CRYPTO_BLOCK_SIZE		16
-#define FS_FNAME_CRYPTO_DIGEST_SIZE	32
-
 /**
  * For encrypted symlinks, the ciphertext length is stored at the beginning
  * of the string in little-endian format.

From 41d5319af3368127b55f6587f1c747dd6a7b9b04 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 26 Nov 2016 22:18:59 -0500
Subject: [PATCH 79/86] fscrypt: move the policy flags and encryption mode
 definitions to uapi header

These constants are part of the UAPI, so they belong in
include/uapi/linux/fs.h instead of include/linux/fscrypto.h

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/fscrypto.h | 14 --------------
 include/uapi/linux/fs.h  | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 71e8a20711ec..42ef82d60790 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -18,20 +18,6 @@
 #include <crypto/skcipher.h>
 #include <uapi/linux/fs.h>
 
-#define FS_POLICY_FLAGS_PAD_4		0x00
-#define FS_POLICY_FLAGS_PAD_8		0x01
-#define FS_POLICY_FLAGS_PAD_16		0x02
-#define FS_POLICY_FLAGS_PAD_32		0x03
-#define FS_POLICY_FLAGS_PAD_MASK	0x03
-#define FS_POLICY_FLAGS_VALID		0x03
-
-/* Encryption algorithms */
-#define FS_ENCRYPTION_MODE_INVALID		0
-#define FS_ENCRYPTION_MODE_AES_256_XTS		1
-#define FS_ENCRYPTION_MODE_AES_256_GCM		2
-#define FS_ENCRYPTION_MODE_AES_256_CBC		3
-#define FS_ENCRYPTION_MODE_AES_256_CTS		4
-
 #define FS_CRYPTO_BLOCK_SIZE		16
 
 struct fscrypt_info;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index acb2b6152ba0..0496d37abe28 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -254,6 +254,20 @@ struct fsxattr {
 /* Policy provided via an ioctl on the topmost directory */
 #define FS_KEY_DESCRIPTOR_SIZE	8
 
+#define FS_POLICY_FLAGS_PAD_4		0x00
+#define FS_POLICY_FLAGS_PAD_8		0x01
+#define FS_POLICY_FLAGS_PAD_16		0x02
+#define FS_POLICY_FLAGS_PAD_32		0x03
+#define FS_POLICY_FLAGS_PAD_MASK	0x03
+#define FS_POLICY_FLAGS_VALID		0x03
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID		0
+#define FS_ENCRYPTION_MODE_AES_256_XTS		1
+#define FS_ENCRYPTION_MODE_AES_256_GCM		2
+#define FS_ENCRYPTION_MODE_AES_256_CBC		3
+#define FS_ENCRYPTION_MODE_AES_256_CTS		4
+
 struct fscrypt_policy {
 	__u8 version;
 	__u8 contents_encryption_mode;

From e550c16c8aab95c7996b0c864d5fcedc2c430a43 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Tue, 6 Dec 2016 23:53:53 +0100
Subject: [PATCH 80/86] fscrypt: Use correct index in decrypt path.

Actually use the fs-provided index instead of always using page->index
which is only set for page-cache pages.

Fixes: 9c4bb8a3a9b4 ("fscrypt: Let fs select encryption index/tweak")

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 4d9d221b1d60..d19a42f3a983 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -296,7 +296,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
 int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 			unsigned int len, unsigned int offs, pgoff_t index)
 {
-	return do_page_crypto(inode, FS_DECRYPT, page->index, page, page, len, offs,
+	return do_page_crypto(inode, FS_DECRYPT, index, page, page, len, offs,
 			GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);

From 9e532772b4e36888584efc7a9531143bd43355b3 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Tue, 6 Dec 2016 23:53:54 +0100
Subject: [PATCH 81/86] fscrypt: Never allocate fscrypt_ctx on in-place
 encryption

In case of in-place encryption fscrypt_ctx was allocated but never
released. Since we don't need it for in-place encryption, we skip
allocating it.

Fixes: 1c7dcf69eea3 ("fscrypt: Add in-place encryption mode")

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index d19a42f3a983..f287f76cc906 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -246,16 +246,26 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 
 	BUG_ON(plaintext_len % FS_CRYPTO_BLOCK_SIZE != 0);
 
+	if (inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION) {
+		/* with inplace-encryption we just encrypt the page */
+		err = do_page_crypto(inode, FS_ENCRYPT, index,
+					plaintext_page, ciphertext_page,
+					plaintext_len, plaintext_offset,
+					gfp_flags);
+		if (err)
+			return ERR_PTR(err);
+
+		return ciphertext_page;
+	}
+
 	ctx = fscrypt_get_ctx(inode, gfp_flags);
 	if (IS_ERR(ctx))
 		return (struct page *)ctx;
 
-	if (!(inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION)) {
-		/* The encryption operation will require a bounce page. */
-		ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
-		if (IS_ERR(ciphertext_page))
-			goto errout;
-	}
+	/* The encryption operation will require a bounce page. */
+	ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+	if (IS_ERR(ciphertext_page))
+		goto errout;
 
 	ctx->w.control_page = plaintext_page;
 	err = do_page_crypto(inode, FS_ENCRYPT, index,
@@ -266,11 +276,9 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 		ciphertext_page = ERR_PTR(err);
 		goto errout;
 	}
-	if (!(inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION)) {
-		SetPagePrivate(ciphertext_page);
-		set_page_private(ciphertext_page, (unsigned long)ctx);
-		lock_page(ciphertext_page);
-	}
+	SetPagePrivate(ciphertext_page);
+	set_page_private(ciphertext_page, (unsigned long)ctx);
+	lock_page(ciphertext_page);
 	return ciphertext_page;
 
 errout:

From 1400451f04f2ff28b658b92557495e5090914aee Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Tue, 6 Dec 2016 23:53:55 +0100
Subject: [PATCH 82/86] fscrypt: Cleanup fscrypt_{decrypt,encrypt}_page()

- Improve documentation
- Add BUG_ON(len == 0) to avoid accidental switch of offs and len
parameters
- Improve variable names for readability

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 93 ++++++++++++++++++++++------------------
 include/linux/fscrypto.h |  8 ++--
 2 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index f287f76cc906..8c104e712bb2 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -147,9 +147,9 @@ typedef enum {
 } fscrypt_direction_t;
 
 static int do_page_crypto(const struct inode *inode,
-			fscrypt_direction_t rw, pgoff_t index,
+			fscrypt_direction_t rw, u64 lblk_num,
 			struct page *src_page, struct page *dest_page,
-			unsigned int src_len, unsigned int src_offset,
+			unsigned int len, unsigned int offs,
 			gfp_t gfp_flags)
 {
 	struct {
@@ -163,6 +163,8 @@ static int do_page_crypto(const struct inode *inode,
 	struct crypto_skcipher *tfm = ci->ci_ctfm;
 	int res = 0;
 
+	BUG_ON(len == 0);
+
 	req = skcipher_request_alloc(tfm, gfp_flags);
 	if (!req) {
 		printk_ratelimited(KERN_ERR
@@ -176,14 +178,14 @@ static int do_page_crypto(const struct inode *inode,
 		page_crypt_complete, &ecr);
 
 	BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
-	xts_tweak.index = cpu_to_le64(index);
+	xts_tweak.index = cpu_to_le64(lblk_num);
 	memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
 
 	sg_init_table(&dst, 1);
-	sg_set_page(&dst, dest_page, src_len, src_offset);
+	sg_set_page(&dst, dest_page, len, offs);
 	sg_init_table(&src, 1);
-	sg_set_page(&src, src_page, src_len, src_offset);
-	skcipher_request_set_crypt(req, &src, &dst, src_len, &xts_tweak);
+	sg_set_page(&src, src_page, len, offs);
+	skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak);
 	if (rw == FS_DECRYPT)
 		res = crypto_skcipher_decrypt(req);
 	else
@@ -214,44 +216,53 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
 
 /**
  * fscypt_encrypt_page() - Encrypts a page
- * @inode:            The inode for which the encryption should take place
- * @plaintext_page:   The page to encrypt. Must be locked.
- * @plaintext_len:    Length of plaintext within page
- * @plaintext_offset: Offset of plaintext within page
- * @index:            Index for encryption. This is mainly the page index, but
- *                    but might be different for multiple calls on same page.
- * @gfp_flags:        The gfp flag for memory allocation
+ * @inode:     The inode for which the encryption should take place
+ * @page:      The page to encrypt. Must be locked for bounce-page
+ *             encryption.
+ * @len:       Length of data to encrypt in @page and encrypted
+ *             data in returned page.
+ * @offs:      Offset of data within @page and returned
+ *             page holding encrypted data.
+ * @lblk_num:  Logical block number. This must be unique for multiple
+ *             calls with same inode, except when overwriting
+ *             previously written data.
+ * @gfp_flags: The gfp flag for memory allocation
  *
- * Encrypts plaintext_page using the ctx encryption context. If
- * the filesystem supports it, encryption is performed in-place, otherwise a
- * new ciphertext_page is allocated and returned.
+ * Encrypts @page using the ctx encryption context. Performs encryption
+ * either in-place or into a newly allocated bounce page.
+ * Called on the page write path.
  *
- * Called on the page write path.  The caller must call
+ * Bounce page allocation is the default.
+ * In this case, the contents of @page are encrypted and stored in an
+ * allocated bounce page. @page has to be locked and the caller must call
  * fscrypt_restore_control_page() on the returned ciphertext page to
  * release the bounce buffer and the encryption context.
  *
- * Return: An allocated page with the encrypted content on success. Else, an
+ * In-place encryption is used by setting the FS_CFLG_INPLACE_ENCRYPTION flag in
+ * fscrypt_operations. Here, the input-page is returned with its content
+ * encrypted.
+ *
+ * Return: A page with the encrypted content on success. Else, an
  * error value or NULL.
  */
 struct page *fscrypt_encrypt_page(const struct inode *inode,
-				struct page *plaintext_page,
-				unsigned int plaintext_len,
-				unsigned int plaintext_offset,
-				pgoff_t index, gfp_t gfp_flags)
+				struct page *page,
+				unsigned int len,
+				unsigned int offs,
+				u64 lblk_num, gfp_t gfp_flags)
 
 {
 	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = plaintext_page;
+	struct page *ciphertext_page = page;
 	int err;
 
-	BUG_ON(plaintext_len % FS_CRYPTO_BLOCK_SIZE != 0);
+	BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
 
 	if (inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION) {
 		/* with inplace-encryption we just encrypt the page */
-		err = do_page_crypto(inode, FS_ENCRYPT, index,
-					plaintext_page, ciphertext_page,
-					plaintext_len, plaintext_offset,
-					gfp_flags);
+		err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+					page, ciphertext_page,
+					len, offs, gfp_flags);
 		if (err)
 			return ERR_PTR(err);
 
@@ -267,11 +278,10 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 	if (IS_ERR(ciphertext_page))
 		goto errout;
 
-	ctx->w.control_page = plaintext_page;
-	err = do_page_crypto(inode, FS_ENCRYPT, index,
-					plaintext_page, ciphertext_page,
-					plaintext_len, plaintext_offset,
-					gfp_flags);
+	ctx->w.control_page = page;
+	err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+					page, ciphertext_page,
+					len, offs, gfp_flags);
 	if (err) {
 		ciphertext_page = ERR_PTR(err);
 		goto errout;
@@ -289,11 +299,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
 
 /**
  * fscrypt_decrypt_page() - Decrypts a page in-place
- * @inode: Encrypted inode to decrypt.
- * @page:  The page to decrypt. Must be locked.
- * @len:   Number of bytes in @page to be decrypted.
- * @offs:  Start of data in @page.
- * @index: Index for encryption.
+ * @inode:     The corresponding inode for the page to decrypt.
+ * @page:      The page to decrypt. Must be locked in case
+ *             it is a writeback page.
+ * @len:       Number of bytes in @page to be decrypted.
+ * @offs:      Start of data in @page.
+ * @lblk_num:  Logical block number.
  *
  * Decrypts page in-place using the ctx encryption context.
  *
@@ -302,10 +313,10 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  * Return: Zero on success, non-zero otherwise.
  */
 int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
-			unsigned int len, unsigned int offs, pgoff_t index)
+			unsigned int len, unsigned int offs, u64 lblk_num)
 {
-	return do_page_crypto(inode, FS_DECRYPT, index, page, page, len, offs,
-			GFP_NOFS);
+	return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len,
+			offs, GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
 
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 42ef82d60790..2d9abfa22b94 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -173,9 +173,9 @@ extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
 extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
 						unsigned int, unsigned int,
-						pgoff_t, gfp_t);
+						u64, gfp_t);
 extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
-				unsigned int, pgoff_t);
+				unsigned int, u64);
 extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
 extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern void fscrypt_restore_control_page(struct page *);
@@ -221,14 +221,14 @@ static inline struct page *fscrypt_notsupp_encrypt_page(const struct inode *i,
 						struct page *p,
 						unsigned int len,
 						unsigned int offs,
-						pgoff_t index, gfp_t f)
+						u64 lblk_num, gfp_t f)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline int fscrypt_notsupp_decrypt_page(const struct inode *i, struct page *p,
 						unsigned int len, unsigned int offs,
-						pgoff_t index)
+						u64 lblk_num)
 {
 	return -EOPNOTSUPP;
 }

From bd7b8290388dd58a8c0a3710b171e58ef952ca4d Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Tue, 6 Dec 2016 23:53:56 +0100
Subject: [PATCH 83/86] fscrypt: Cleanup page locking requirements for
 fscrypt_{decrypt,encrypt}_page()

Rename the FS_CFLG_INPLACE_ENCRYPTION flag to FS_CFLG_OWN_PAGES which,
when set, indicates that the fs uses pages under its own control as
opposed to writeback pages which require locking and a bounce buffer for
encryption.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c       | 11 ++++++++---
 fs/ext4/inode.c          |  1 -
 fs/f2fs/data.c           |  1 -
 include/linux/fscrypto.h |  2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 8c104e712bb2..5ffc59436397 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -238,7 +238,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
  * fscrypt_restore_control_page() on the returned ciphertext page to
  * release the bounce buffer and the encryption context.
  *
- * In-place encryption is used by setting the FS_CFLG_INPLACE_ENCRYPTION flag in
+ * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
  * fscrypt_operations. Here, the input-page is returned with its content
  * encrypted.
  *
@@ -258,7 +258,7 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 
 	BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
 
-	if (inode->i_sb->s_cop->flags & FS_CFLG_INPLACE_ENCRYPTION) {
+	if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
 		/* with inplace-encryption we just encrypt the page */
 		err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
 					page, ciphertext_page,
@@ -269,6 +269,8 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 		return ciphertext_page;
 	}
 
+	BUG_ON(!PageLocked(page));
+
 	ctx = fscrypt_get_ctx(inode, gfp_flags);
 	if (IS_ERR(ctx))
 		return (struct page *)ctx;
@@ -301,7 +303,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
  * fscrypt_decrypt_page() - Decrypts a page in-place
  * @inode:     The corresponding inode for the page to decrypt.
  * @page:      The page to decrypt. Must be locked in case
- *             it is a writeback page.
+ *             it is a writeback page (FS_CFLG_OWN_PAGES unset).
  * @len:       Number of bytes in @page to be decrypted.
  * @offs:      Start of data in @page.
  * @lblk_num:  Logical block number.
@@ -315,6 +317,9 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
 int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 			unsigned int len, unsigned int offs, u64 lblk_num)
 {
+	if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
+		BUG_ON(!PageLocked(page));
+
 	return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len,
 			offs, GFP_NOFS);
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1485ac273bfb..fb2b514f675b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3744,7 +3744,6 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
 			BUG_ON(blocksize != PAGE_SIZE);
-			BUG_ON(!PageLocked(page));
 			WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
 						page, PAGE_SIZE, 0, page->index));
 		}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 435590c4b341..9f0ba90b92e4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1194,7 +1194,6 @@ int do_write_data_page(struct f2fs_io_info *fio)
 		f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
 							fio->old_blkaddr);
 retry_encrypt:
-		BUG_ON(!PageLocked(fio->page));
 		fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
 							PAGE_SIZE, 0,
 							fio->page->index,
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 2d9abfa22b94..188b4fa59cbf 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -80,7 +80,7 @@ struct fscrypt_name {
 /*
  * fscrypt superblock flags
  */
-#define FS_CFLG_INPLACE_ENCRYPTION (1U << 1)
+#define FS_CFLG_OWN_PAGES (1U << 1)
 
 /*
  * crypto opertions for filesystems

From f32d7ac20a5864483c1f96e4970daa083e18bfd1 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Tue, 6 Dec 2016 23:53:57 +0100
Subject: [PATCH 84/86] fscrypt: Delay bounce page pool allocation until needed

Since fscrypt users can now indicated if fscrypt_encrypt_page() should
use a bounce page, we can delay the bounce page pool initialization util
it is really needed. That is until fscrypt_operations has no
FS_CFLG_OWN_PAGES flag set.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c          | 9 +++++++--
 fs/crypto/fscrypt_private.h | 2 +-
 fs/crypto/keyinfo.c         | 2 +-
 include/linux/fscrypto.h    | 1 -
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 5ffc59436397..bc1d4781b9ec 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -525,17 +525,22 @@ static void fscrypt_destroy(void)
 
 /**
  * fscrypt_initialize() - allocate major buffers for fs encryption.
+ * @cop_flags:  fscrypt operations flags
  *
  * We only call this when we start accessing encrypted files, since it
  * results in memory getting allocated that wouldn't otherwise be used.
  *
  * Return: Zero on success, non-zero otherwise.
  */
-int fscrypt_initialize(void)
+int fscrypt_initialize(unsigned int cop_flags)
 {
 	int i, res = -ENOMEM;
 
-	if (fscrypt_bounce_page_pool)
+	/*
+	 * No need to allocate a bounce page pool if there already is one or
+	 * this FS won't use it.
+	 */
+	if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool)
 		return 0;
 
 	mutex_lock(&fscrypt_init_mutex);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index c98b2a7fb6d3..7ba10cd45a2e 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -85,7 +85,7 @@ struct fscrypt_completion_result {
 
 
 /* crypto.c */
-int fscrypt_initialize(void);
+int fscrypt_initialize(unsigned int cop_flags);
 
 /* keyinfo.c */
 extern int fscrypt_get_crypt_info(struct inode *);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 35d3317a27b3..6eeea1dcba41 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -188,7 +188,7 @@ int fscrypt_get_crypt_info(struct inode *inode)
 	u8 *raw_key = NULL;
 	int res;
 
-	res = fscrypt_initialize();
+	res = fscrypt_initialize(inode->i_sb->s_cop->flags);
 	if (res)
 		return res;
 
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 188b4fa59cbf..1adc1c758d31 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -168,7 +168,6 @@ static inline void fscrypt_set_d_op(struct dentry *dentry)
 #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
 /* crypto.c */
 extern struct kmem_cache *fscrypt_info_cachep;
-
 extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
 extern void fscrypt_release_ctx(struct fscrypt_ctx *);
 extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,

From 6a34e4d2be07255cc59e6ccaf820669cfd7f815c Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Tue, 6 Dec 2016 23:53:58 +0100
Subject: [PATCH 85/86] fscrypt: Rename FS_WRITE_PATH_FL to
 FS_CTX_HAS_BOUNCE_BUFFER_FL

... to better explain its purpose after introducing in-place encryption
without bounce buffer.

Signed-off-by: David Gstir <david@sigma-star.at>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/crypto/crypto.c          | 6 +++---
 fs/crypto/fscrypt_private.h | 2 +-
 include/linux/fscrypto.h    | 1 -
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index bc1d4781b9ec..ac8e4f6a3773 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
 {
 	unsigned long flags;
 
-	if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+	if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
 		mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
 		ctx->w.bounce_page = NULL;
 	}
@@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
 	} else {
 		ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
 	}
-	ctx->flags &= ~FS_WRITE_PATH_FL;
+	ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
 	return ctx;
 }
 EXPORT_SYMBOL(fscrypt_get_ctx);
@@ -210,7 +210,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
 	ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
 	if (ctx->w.bounce_page == NULL)
 		return ERR_PTR(-ENOMEM);
-	ctx->flags |= FS_WRITE_PATH_FL;
+	ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
 	return ctx->w.bounce_page;
 }
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 7ba10cd45a2e..aeab032d7d35 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -72,7 +72,7 @@ struct fscrypt_info {
 };
 
 #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
-#define FS_WRITE_PATH_FL			0x00000002
+#define FS_CTX_HAS_BOUNCE_BUFFER_FL		0x00000002
 
 struct fscrypt_completion_result {
 	struct completion completion;
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
index 1adc1c758d31..c074b670aa99 100644
--- a/include/linux/fscrypto.h
+++ b/include/linux/fscrypto.h
@@ -21,7 +21,6 @@
 #define FS_CRYPTO_BLOCK_SIZE		16
 
 struct fscrypt_info;
-struct fscrypt_ctx;
 
 struct fscrypt_ctx {
 	union {

From 0cb80b4847553582830a59da2c022c37a1f4a119 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Dec 2016 21:34:12 -0500
Subject: [PATCH 86/86] dax: Fix sleep in atomic contex in grab_mapping_entry()

Commit 642261ac995e: "dax: add struct iomap based DAX PMD support" has
introduced unmapping of page tables if huge page needs to be split in
grab_mapping_entry(). However the unmapping happens after
radix_tree_preload() call which disables preemption and thus
unmap_mapping_range() tries to acquire i_mmap_lock in atomic context
which is a bug. Fix the problem by moving unmapping before
radix_tree_preload() call.

Fixes: 642261ac995e01d7837db1f4b90181496f7e6835
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/dax.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ad131cd2605d..5bfd27b4a69c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -333,14 +333,6 @@ restart:
 		}
 
 		spin_unlock_irq(&mapping->tree_lock);
-		err = radix_tree_preload(
-				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err) {
-			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index, entry);
-			return ERR_PTR(err);
-		}
-
 		/*
 		 * Besides huge zero pages the only other thing that gets
 		 * downgraded are empty entries which don't need to be
@@ -350,6 +342,13 @@ restart:
 			unmap_mapping_range(mapping,
 				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
 
+		err = radix_tree_preload(
+				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+		if (err) {
+			if (pmd_downgrade)
+				put_locked_mapping_entry(mapping, index, entry);
+			return ERR_PTR(err);
+		}
 		spin_lock_irq(&mapping->tree_lock);
 
 		if (pmd_downgrade) {