From 742c8fdc31e820503f9267070311d894978d1349 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 21 Oct 2016 10:06:40 -0400 Subject: [PATCH 01/50] dm bio prison v2: new interface for the bio prison The deferred set is gone and all methods have _v2 appended to the end of their names to allow for continued use of the original bio prison in DM thin-provisioning. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/Makefile | 1 + .../{dm-bio-prison.c => dm-bio-prison-v1.c} | 46 ++- .../{dm-bio-prison.h => dm-bio-prison-v1.h} | 2 +- drivers/md/dm-bio-prison-v2.c | 369 ++++++++++++++++++ drivers/md/dm-bio-prison-v2.h | 152 ++++++++ drivers/md/dm-cache-target.c | 2 +- drivers/md/dm-thin.c | 2 +- 7 files changed, 568 insertions(+), 6 deletions(-) rename drivers/md/{dm-bio-prison.c => dm-bio-prison-v1.c} (94%) rename drivers/md/{dm-bio-prison.h => dm-bio-prison-v1.h} (99%) create mode 100644 drivers/md/dm-bio-prison-v2.c create mode 100644 drivers/md/dm-bio-prison-v2.h diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3cbda1af87a0..d378b1db7852 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -11,6 +11,7 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-mirror-y += dm-raid1.o dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o dm-cache-smq-y += dm-cache-policy-smq.o diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison-v1.c similarity index 94% rename from drivers/md/dm-bio-prison.c rename to drivers/md/dm-bio-prison-v1.c index 03af174485d3..ae7da2c30a57 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -5,7 +5,8 @@ */ #include "dm.h" -#include "dm-bio-prison.h" +#include "dm-bio-prison-v1.h" +#include "dm-bio-prison-v2.h" #include #include @@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work); /*----------------------------------------------------------------*/ -static int __init dm_bio_prison_init(void) +static int __init dm_bio_prison_init_v1(void) { _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); if (!_cell_cache) @@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void) return 0; } -static void __exit dm_bio_prison_exit(void) +static void dm_bio_prison_exit_v1(void) { kmem_cache_destroy(_cell_cache); _cell_cache = NULL; } +static int (*_inits[])(void) __initdata = { + dm_bio_prison_init_v1, + dm_bio_prison_init_v2, +}; + +static void (*_exits[])(void) = { + dm_bio_prison_exit_v1, + dm_bio_prison_exit_v2, +}; + +static int __init dm_bio_prison_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i](); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _exits[i](); + + return r; +} + +static void __exit dm_bio_prison_exit(void) +{ + int i = ARRAY_SIZE(_exits); + + while (i--) + _exits[i](); +} + /* * module hooks */ diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison-v1.h similarity index 99% rename from drivers/md/dm-bio-prison.h rename to drivers/md/dm-bio-prison-v1.h index 54352f009bfd..cddd4ac07e2c 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2012 Red Hat, Inc. + * Copyright (C) 2011-2017 Red Hat, Inc. * * This file is released under the GPL. 
*/ diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c new file mode 100644 index 000000000000..c9b11f799cd8 --- /dev/null +++ b/drivers/md/dm-bio-prison-v2.c @@ -0,0 +1,369 @@ +/* + * Copyright (C) 2012-2017 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-prison-v2.h" + +#include +#include +#include +#include +#include + +/*----------------------------------------------------------------*/ + +#define MIN_CELLS 1024 + +struct dm_bio_prison_v2 { + struct workqueue_struct *wq; + + spinlock_t lock; + mempool_t *cell_pool; + struct rb_root cells; +}; + +static struct kmem_cache *_cell_cache; + +/*----------------------------------------------------------------*/ + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. + */ +struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq) +{ + struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL); + + if (!prison) + return NULL; + + prison->wq = wq; + spin_lock_init(&prison->lock); + + prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); + if (!prison->cell_pool) { + kfree(prison); + return NULL; + } + + prison->cells = RB_ROOT; + + return prison; +} +EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2); + +void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison) +{ + mempool_destroy(prison->cell_pool); + kfree(prison); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2); + +struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp) +{ + return mempool_alloc(prison->cell_pool, gfp); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2); + +void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + mempool_free(cell, prison->cell_pool); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2); + +static void __setup_new_cell(struct dm_cell_key_v2 *key, + struct dm_bio_prison_cell_v2 *cell) +{ + memset(cell, 0, sizeof(*cell)); + memcpy(&cell->key, key, sizeof(cell->key)); + bio_list_init(&cell->bios); +} + +static int cmp_keys(struct dm_cell_key_v2 *lhs, + struct dm_cell_key_v2 *rhs) +{ + if (lhs->virtual < rhs->virtual) + return -1; + + if (lhs->virtual > rhs->virtual) + return 1; + + if (lhs->dev < rhs->dev) + return -1; + + if (lhs->dev > rhs->dev) + return 1; + + if (lhs->block_end <= rhs->block_begin) + return -1; + + if (lhs->block_begin >= rhs->block_end) + return 1; + + return 0; +} + +/* + * Returns true if node found, otherwise it inserts a new one. 
+ */ +static bool __find_or_insert(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **result) +{ + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell_v2 *cell = + container_of(*new, struct dm_bio_prison_cell_v2, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + + else if (r > 0) + new = &((*new)->rb_right); + + else { + *result = cell; + return true; + } + } + + __setup_new_cell(key, cell_prealloc); + *result = cell_prealloc; + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + + return false; +} + +static bool __get(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell) +{ + if (__find_or_insert(prison, key, cell_prealloc, cell)) { + if ((*cell)->exclusive_lock) { + if (lock_level <= (*cell)->exclusive_level) { + bio_list_add(&(*cell)->bios, inmate); + return false; + } + } + + (*cell)->shared_count++; + + } else + (*cell)->shared_count = 1; + + return true; +} + +bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_get_v2); + +static bool __put(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + BUG_ON(!cell->shared_count); + cell->shared_count--; + + // FIXME: shared locks granted above the lock level could starve this + if (!cell->shared_count) { + if (cell->exclusive_lock){ + if (cell->quiesce_continuation) { + queue_work(prison->wq, cell->quiesce_continuation); + cell->quiesce_continuation = NULL; + } + } else { + rb_erase(&cell->node, &prison->cells); + return true; + } + } + + return false; +} + +bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __put(prison, cell); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_put_v2); + +static int __lock(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + struct dm_bio_prison_cell_v2 *cell; + + if (__find_or_insert(prison, key, cell_prealloc, &cell)) { + if (cell->exclusive_lock) + return -EBUSY; + + cell->exclusive_lock = true; + cell->exclusive_level = lock_level; + *cell_result = cell; + + // FIXME: we don't yet know what level these shared locks + // were taken at, so have to quiesce them all. 
+ return cell->shared_count > 0; + + } else { + cell = cell_prealloc; + cell->shared_count = 0; + cell->exclusive_lock = true; + cell->exclusive_level = lock_level; + *cell_result = cell; + } + + return 0; +} + +int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __lock(prison, key, lock_level, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_lock_v2); + +static void __quiesce(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation) +{ + if (!cell->shared_count) + queue_work(prison->wq, continuation); + else + cell->quiesce_continuation = continuation; +} + +void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + __quiesce(prison, cell, continuation); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); + +static int __promote(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned new_lock_level) +{ + if (!cell->exclusive_lock) + return -EINVAL; + + cell->exclusive_level = new_lock_level; + return cell->shared_count > 0; +} + +int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned new_lock_level) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __promote(prison, cell, new_lock_level); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2); + +static bool __unlock(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios) +{ + BUG_ON(!cell->exclusive_lock); + + bio_list_merge(bios, &cell->bios); + bio_list_init(&cell->bios); + + if (cell->shared_count) { + cell->exclusive_lock = 0; + return false; + } + + rb_erase(&cell->node, &prison->cells); + return true; +} + +bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + r = __unlock(prison, cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cell_unlock_v2); + +/*----------------------------------------------------------------*/ + +int __init dm_bio_prison_init_v2(void) +{ + _cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0); + if (!_cell_cache) + return -ENOMEM; + + return 0; +} + +void dm_bio_prison_exit_v2(void) +{ + kmem_cache_destroy(_cell_cache); + _cell_cache = NULL; +} diff --git a/drivers/md/dm-bio-prison-v2.h b/drivers/md/dm-bio-prison-v2.h new file mode 100644 index 000000000000..6e04234268db --- /dev/null +++ b/drivers/md/dm-bio-prison-v2.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2011-2017 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_BIO_PRISON_V2_H +#define DM_BIO_PRISON_V2_H + +#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ +#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ + +#include +#include +#include + +/*----------------------------------------------------------------*/ + +int dm_bio_prison_init_v2(void); +void dm_bio_prison_exit_v2(void); + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct dm_bio_prison_v2; + +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ +struct dm_cell_key_v2 { + int virtual; + dm_thin_id dev; + dm_block_t block_begin, block_end; +}; + +/* + * Treat this as opaque, only in header so callers can manage allocation + * themselves. + */ +struct dm_bio_prison_cell_v2 { + // FIXME: pack these + bool exclusive_lock; + unsigned exclusive_level; + unsigned shared_count; + struct work_struct *quiesce_continuation; + + struct rb_node node; + struct dm_cell_key_v2 key; + struct bio_list bios; +}; + +struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq); +void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison); + +/* + * These two functions just wrap a mempool. This is a transitory step: + * Eventually all bio prison clients should manage their own cell memory. + * + * Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called + * in interrupt context or passed GFP_NOWAIT. + */ +struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, + gfp_t gfp); +void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell); + +/* + * Shared locks have a bio associated with them. + * + * If the lock is granted the caller can continue to use the bio, and must + * call dm_cell_put_v2() to drop the reference count when finished using it. + * + * If the lock cannot be granted then the bio will be tracked within the + * cell, and later given to the holder of the exclusive lock. + * + * See dm_cell_lock_v2() for discussion of the lock_level parameter. + * + * Compare *cell_result with cell_prealloc to see if the prealloc was used. + * If cell_prealloc was used then inmate wasn't added to it. + * + * Returns true if the lock is granted. + */ +bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct bio *inmate, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result); + +/* + * Decrement the shared reference count for the lock. Returns true if + * returning ownership of the cell (ie. you should free it). + */ +bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell); + +/* + * Locks a cell. No associated bio. Exclusive locks get priority. These + * locks constrain whether the io locks are granted according to level. + * + * Shared locks will still be granted if the lock_level is > (not = to) the + * exclusive lock level. + * + * If an _exclusive_ lock is already held then -EBUSY is returned. 
+ * + * Return values: + * < 0 - error + * 0 - locked; no quiescing needed + * 1 - locked; quiescing needed + */ +int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, + struct dm_cell_key_v2 *key, + unsigned lock_level, + struct dm_bio_prison_cell_v2 *cell_prealloc, + struct dm_bio_prison_cell_v2 **cell_result); + +void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct work_struct *continuation); + +/* + * Promotes an _exclusive_ lock to a higher lock level. + * + * Return values: + * < 0 - error + * 0 - promoted; no quiescing needed + * 1 - promoted; quiescing needed + */ +int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + unsigned new_lock_level); + +/* + * Adds any held bios to the bio list. + * + * There may be shared locks still held at this point even if you quiesced + * (ie. different lock levels). + * + * Returns true if returning ownership of the cell (ie. you should free + * it). + */ +bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, + struct dm_bio_prison_cell_v2 *cell, + struct bio_list *bios); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9c689b34e6e7..2eaa414e1509 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -5,7 +5,7 @@ */ #include "dm.h" -#include "dm-bio-prison.h" +#include "dm-bio-prison-v1.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b266a2b5035..9b3e2fcbfb1b 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -5,7 +5,7 @@ */ #include "dm-thin-metadata.h" -#include "dm-bio-prison.h" +#include "dm-bio-prison-v1.h" #include "dm.h" #include From b29d4986d0da1a27cd35917cdb433672f5c95d7f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 15 Dec 2016 04:57:31 -0500 Subject: [PATCH 02/50] dm cache: significant rework to leverage dm-bio-prison-v2 The cache policy interfaces have been updated to work well with the new bio-prison v2 interface's ability to queue work immediately (promotion, demotion, etc) -- overriding benefit being reduced latency on processing IO through the cache. Previously such work would be left for the DM cache core to queue on various lists and then process in batches later -- this caused a serious delay in latency for IO driven by the cache. The background tracker code was factored out so that all cache policies can make use of it. Also, the "cleaner" policy has been removed and is now a variant of the smq policy that simply disallows migrations. 
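To make the new flow concrete, two short sketches follow. Neither is part of this patch; helper names such as exclusive_update, continuation, issue_migration and drain_background_work are hypothetical, and only the dm_cell_*_v2 and policy_* calls they use come from this series.

First, the exclusive-lock lifecycle offered by dm-bio-prison-v2: take the lock, quiesce any shared holders, then hand the rest of the update to a continuation on the prison's workqueue.

#include <linux/workqueue.h>

#include "dm-bio-prison-v2.h"

/*
 * Illustrative only: a hypothetical client taking an exclusive lock.
 * "prison" must have been created with dm_bio_prison_create_v2(), and
 * "continuation" is a work_struct that finishes the update and finally
 * calls dm_cell_unlock_v2() / dm_bio_prison_free_cell_v2().
 */
static void exclusive_update(struct dm_bio_prison_v2 *prison,
			     struct dm_cell_key_v2 *key,
			     struct work_struct *continuation)
{
	struct dm_bio_prison_cell_v2 *prealloc, *cell;
	int r;

	/* mempool backed, so this cannot fail with a blocking gfp mask */
	prealloc = dm_bio_prison_alloc_cell_v2(prison, GFP_NOIO);

	r = dm_cell_lock_v2(prison, key, 0, prealloc, &cell);
	if (r < 0) {
		/* -EBUSY: another exclusive holder, back off */
		dm_bio_prison_free_cell_v2(prison, prealloc);
		return;
	}

	/* the prealloc is only consumed if no cell existed for this key */
	if (cell != prealloc)
		dm_bio_prison_free_cell_v2(prison, prealloc);

	if (r == 1) {
		/*
		 * Shared holders remain; the prison queues "continuation"
		 * on its workqueue once the last dm_cell_put_v2() drops
		 * the shared count to zero.
		 */
		dm_cell_quiesce_v2(prison, cell, continuation);
		return;
	}

	/* r == 0: exclusive lock granted with nothing to quiesce */
	queue_work(system_wq, continuation);
}

Second, a minimal sketch of how a consumer drains the background work (promotion, demotion, writeback) that the reworked policy interface now queues immediately. The real cache target completes migrations asynchronously rather than inline in a loop like this.

#include "dm-cache-policy-internal.h"

/* hypothetical stub standing in for the real migration machinery */
static bool issue_migration(struct policy_work *work)
{
	return false;
}

static void drain_background_work(struct dm_cache_policy *policy, bool idle)
{
	struct policy_work *work;

	/* stops once the policy reports it has nothing queued */
	while (!policy_get_background_work(policy, idle, &work)) {
		bool success = issue_migration(work);

		/* lets the policy and its background tracker update their accounting */
		policy_complete_background_work(policy, work, success);
	}
}

Routing all deferred work through per-cell continuations and the pending-work tracker is what removes the old batch-processing lists and the latency they added.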
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 8 - drivers/md/Makefile | 5 +- drivers/md/dm-cache-background-tracker.c | 238 ++ drivers/md/dm-cache-background-tracker.h | 46 + drivers/md/dm-cache-metadata.h | 2 + drivers/md/dm-cache-policy-cleaner.c | 469 ---- drivers/md/dm-cache-policy-internal.h | 86 +- drivers/md/dm-cache-policy-smq.c | 877 ++++--- drivers/md/dm-cache-policy.h | 197 +- drivers/md/dm-cache-target.c | 2723 ++++++++++------------ 10 files changed, 2087 insertions(+), 2564 deletions(-) create mode 100644 drivers/md/dm-cache-background-tracker.c create mode 100644 drivers/md/dm-cache-background-tracker.h delete mode 100644 drivers/md/dm-cache-policy-cleaner.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b7767da50c26..982cd0626bc7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -325,14 +325,6 @@ config DM_CACHE_SMQ of less memory utilization, improved performance and increased adaptability in the face of changing workloads. -config DM_CACHE_CLEANER - tristate "Cleaner Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A simple cache policy that writes back all data to the - origin. Used when decommissioning a dm-cache. - config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d378b1db7852..2801b2fb452d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -13,9 +13,9 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o -dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ + dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o -dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o @@ -57,7 +57,6 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o -obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c new file mode 100644 index 000000000000..9b1afdfb13f0 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.c @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. 
+ */ + +#include "dm-cache-background-tracker.h" + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "dm-background-tracker" + +struct bt_work { + struct list_head list; + struct rb_node node; + struct policy_work work; +}; + +struct background_tracker { + unsigned max_work; + atomic_t pending_promotes; + atomic_t pending_writebacks; + atomic_t pending_demotes; + + struct list_head issued; + struct list_head queued; + struct rb_root pending; + + struct kmem_cache *work_cache; +}; + +struct background_tracker *btracker_create(unsigned max_work) +{ + struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); + + b->max_work = max_work; + atomic_set(&b->pending_promotes, 0); + atomic_set(&b->pending_writebacks, 0); + atomic_set(&b->pending_demotes, 0); + + INIT_LIST_HEAD(&b->issued); + INIT_LIST_HEAD(&b->queued); + + b->pending = RB_ROOT; + b->work_cache = KMEM_CACHE(bt_work, 0); + if (!b->work_cache) { + DMERR("couldn't create mempool for background work items"); + kfree(b); + b = NULL; + } + + return b; +} +EXPORT_SYMBOL_GPL(btracker_create); + +void btracker_destroy(struct background_tracker *b) +{ + kmem_cache_destroy(b->work_cache); + kfree(b); +} +EXPORT_SYMBOL_GPL(btracker_destroy); + +static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) +{ + if (from_oblock(lhs) < from_oblock(rhs)) + return -1; + + if (from_oblock(rhs) < from_oblock(lhs)) + return 1; + + return 0; +} + +static bool __insert_pending(struct background_tracker *b, + struct bt_work *nw) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node, *parent = NULL; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + parent = *new; + cmp = cmp_oblock(w->work.oblock, nw->work.oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + /* already present */ + return false; + } + + rb_link_node(&nw->node, parent, new); + rb_insert_color(&nw->node, &b->pending); + + return true; +} + +static struct bt_work *__find_pending(struct background_tracker *b, + dm_oblock_t oblock) +{ + int cmp; + struct bt_work *w; + struct rb_node **new = &b->pending.rb_node; + + while (*new) { + w = container_of(*new, struct bt_work, node); + + cmp = cmp_oblock(w->work.oblock, oblock); + if (cmp < 0) + new = &((*new)->rb_left); + + else if (cmp > 0) + new = &((*new)->rb_right); + + else + break; + } + + return *new ? 
w : NULL; +} + + +static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) +{ + switch (w->op) { + case POLICY_PROMOTE: + atomic_add(delta, &b->pending_promotes); + break; + + case POLICY_DEMOTE: + atomic_add(delta, &b->pending_demotes); + break; + + case POLICY_WRITEBACK: + atomic_add(delta, &b->pending_writebacks); + break; + } +} + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_writebacks); +} +EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); + +unsigned btracker_nr_demotions_queued(struct background_tracker *b) +{ + return atomic_read(&b->pending_demotes); +} +EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); + +static bool max_work_reached(struct background_tracker *b) +{ + // FIXME: finish + return false; +} + +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork) +{ + struct bt_work *w; + + if (pwork) + *pwork = NULL; + + if (max_work_reached(b)) + return -ENOMEM; + + w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); + if (!w) + return -ENOMEM; + + memcpy(&w->work, work, sizeof(*work)); + + if (!__insert_pending(b, w)) { + /* + * There was a race, we'll just ignore this second + * bit of work for the same oblock. + */ + kmem_cache_free(b->work_cache, w); + return -EINVAL; + } + + if (pwork) { + *pwork = &w->work; + list_add(&w->list, &b->issued); + } else + list_add(&w->list, &b->queued); + update_stats(b, &w->work, 1); + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_queue); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work) +{ + struct bt_work *w; + + if (list_empty(&b->queued)) + return -ENODATA; + + w = list_first_entry(&b->queued, struct bt_work, list); + list_move(&w->list, &b->issued); + *work = &w->work; + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_issue); + +void btracker_complete(struct background_tracker *b, + struct policy_work *op) +{ + struct bt_work *w = container_of(op, struct bt_work, work); + + update_stats(b, &w->work, -1); + rb_erase(&w->node, &b->pending); + list_del(&w->list); + kmem_cache_free(b->work_cache, w); +} +EXPORT_SYMBOL_GPL(btracker_complete); + +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock) +{ + return __find_pending(b, oblock) != NULL; +} +EXPORT_SYMBOL_GPL(btracker_promotion_already_present); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h new file mode 100644 index 000000000000..27ab90dbc275 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BACKGROUND_WORK_H +#define DM_CACHE_BACKGROUND_WORK_H + +#include +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +struct background_work; +struct background_tracker; + +/* + * FIXME: discuss lack of locking in all methods. + */ +struct background_tracker *btracker_create(unsigned max_work); +void btracker_destroy(struct background_tracker *b); + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b); +unsigned btracker_nr_demotions_queued(struct background_tracker *b); + +/* + * returns -EINVAL iff the work is already queued. -ENOMEM if the work + * couldn't be queued for another reason. 
+ */ +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work); +void btracker_complete(struct background_tracker *b, + struct policy_work *op); +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 4f07c08cf107..179ed5bf81a3 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -50,6 +50,8 @@ #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL +struct dm_cache_metadata; + /* * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on * failure. If reopening then features must match. diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c deleted file mode 100644 index 2e8a8f1d8358..000000000000 --- a/drivers/md/dm-cache-policy-cleaner.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * writeback cache policy supporting flushing out dirty cache blocks. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include -#include -#include -#include - -/*----------------------------------------------------------------*/ - -#define DM_MSG_PREFIX "cache cleaner" - -/* Cache entry struct. */ -struct wb_cache_entry { - struct list_head list; - struct hlist_node hlist; - - dm_oblock_t oblock; - dm_cblock_t cblock; - bool dirty:1; - bool pending:1; -}; - -struct hash { - struct hlist_head *table; - dm_block_t hash_bits; - unsigned nr_buckets; -}; - -struct policy { - struct dm_cache_policy policy; - spinlock_t lock; - - struct list_head free; - struct list_head clean; - struct list_head clean_pending; - struct list_head dirty; - - /* - * We know exactly how many cblocks will be needed, - * so we can allocate them up front. - */ - dm_cblock_t cache_size, nr_cblocks_allocated; - struct wb_cache_entry *cblocks; - struct hash chash; -}; - -/*----------------------------------------------------------------------------*/ - -/* - * Low-level functions. - */ -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -static struct policy *to_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct policy, policy); -} - -static struct list_head *list_pop(struct list_head *q) -{ - struct list_head *r = q->next; - - list_del(r); - - return r; -} - -/*----------------------------------------------------------------------------*/ - -/* Allocate/free various resources. */ -static int alloc_hash(struct hash *hash, unsigned elts) -{ - hash->nr_buckets = next_power(elts >> 4, 16); - hash->hash_bits = __ffs(hash->nr_buckets); - hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); - - return hash->table ? 0 : -ENOMEM; -} - -static void free_hash(struct hash *hash) -{ - vfree(hash->table); -} - -static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) -{ - int r = -ENOMEM; - - p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); - if (p->cblocks) { - unsigned u = from_cblock(cache_size); - - while (u--) - list_add(&p->cblocks[u].list, &p->free); - - p->nr_cblocks_allocated = 0; - - /* Cache entries hash. 
*/ - r = alloc_hash(&p->chash, from_cblock(cache_size)); - if (r) - vfree(p->cblocks); - } - - return r; -} - -static void free_cache_blocks_and_hash(struct policy *p) -{ - free_hash(&p->chash); - vfree(p->cblocks); -} - -static struct wb_cache_entry *alloc_cache_entry(struct policy *p) -{ - struct wb_cache_entry *e; - - BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); - - e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); - - return e; -} - -/*----------------------------------------------------------------------------*/ - -/* Hash functions (lookup, insert, remove). */ -static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) -{ - struct hash *hash = &p->chash; - unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); - struct wb_cache_entry *cur; - struct hlist_head *bucket = &hash->table[h]; - - hlist_for_each_entry(cur, bucket, hlist) { - if (cur->oblock == oblock) { - /* Move upfront bucket for faster access. */ - hlist_del(&cur->hlist); - hlist_add_head(&cur->hlist, bucket); - return cur; - } - } - - return NULL; -} - -static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); - - hlist_add_head(&e->hlist, &p->chash.table[h]); -} - -static void remove_cache_hash_entry(struct wb_cache_entry *e) -{ - hlist_del(&e->hlist); -} - -/* Public interface (see dm-cache-policy.h */ -static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - result->op = POLICY_MISS; - - if (can_block) - spin_lock_irqsave(&p->lock, flags); - - else if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - result->op = POLICY_HIT; - result->cblock = e->cblock; - - } - - spin_unlock_irqrestore(&p->lock, flags); - - return 0; -} - -static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - *cblock = e->cblock; - r = 0; - - } else - r = -ENOENT; - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - - e = lookup_cache_entry(p, oblock); - BUG_ON(!e); - - if (set) { - if (!e->dirty) { - e->dirty = true; - list_move(&e->list, &p->dirty); - } - - } else { - if (e->dirty) { - e->pending = false; - e->dirty = false; - list_move(&e->list, &p->clean); - } - } -} - -static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, true); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, false); - spin_unlock_irqrestore(&p->lock, 
flags); -} - -static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) -{ - insert_cache_hash_entry(p, e); - if (e->dirty) - list_add(&e->list, &p->dirty); - else - list_add(&e->list, &p->clean); -} - -static int wb_load_mapping(struct dm_cache_policy *pe, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e = alloc_cache_entry(p); - - if (e) { - e->cblock = cblock; - e->oblock = oblock; - e->dirty = false; /* blocks default to clean */ - add_cache_entry(p, e); - r = 0; - - } else - r = -ENOMEM; - - return r; -} - -static void wb_destroy(struct dm_cache_policy *pe) -{ - struct policy *p = to_policy(pe); - - free_cache_blocks_and_hash(p); - kfree(p); -} - -static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) -{ - struct wb_cache_entry *r = lookup_cache_entry(p, oblock); - - BUG_ON(!r); - - remove_cache_hash_entry(r); - list_del(&r->list); - - return r; -} - -static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, oblock); - list_add_tail(&e->list, &p->free); - BUG_ON(!from_cblock(p->nr_cblocks_allocated)); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_force_mapping(struct dm_cache_policy *pe, - dm_oblock_t current_oblock, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, current_oblock); - e->oblock = oblock; - add_cache_entry(p, e); - spin_unlock_irqrestore(&p->lock, flags); -} - -static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) -{ - struct list_head *l; - struct wb_cache_entry *r; - - if (list_empty(&p->dirty)) - return NULL; - - l = list_pop(&p->dirty); - r = container_of(l, struct wb_cache_entry, list); - list_add(l, &p->clean_pending); - - return r; -} - -static int wb_writeback_work(struct dm_cache_policy *pe, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) -{ - int r = -ENOENT; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - - e = get_next_dirty_entry(p); - if (e) { - *oblock = e->oblock; - *cblock = e->cblock; - r = 0; - } - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static dm_cblock_t wb_residency(struct dm_cache_policy *pe) -{ - return to_policy(pe)->nr_cblocks_allocated; -} - -/* Init the policy plugin interface function pointers. 
*/ -static void init_policy_functions(struct policy *p) -{ - p->policy.destroy = wb_destroy; - p->policy.map = wb_map; - p->policy.lookup = wb_lookup; - p->policy.set_dirty = wb_set_dirty; - p->policy.clear_dirty = wb_clear_dirty; - p->policy.load_mapping = wb_load_mapping; - p->policy.get_hint = NULL; - p->policy.remove_mapping = wb_remove_mapping; - p->policy.writeback_work = wb_writeback_work; - p->policy.force_mapping = wb_force_mapping; - p->policy.residency = wb_residency; - p->policy.tick = NULL; -} - -static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - int r; - struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); - - if (!p) - return NULL; - - init_policy_functions(p); - INIT_LIST_HEAD(&p->free); - INIT_LIST_HEAD(&p->clean); - INIT_LIST_HEAD(&p->clean_pending); - INIT_LIST_HEAD(&p->dirty); - - p->cache_size = cache_size; - spin_lock_init(&p->lock); - - /* Allocate cache entry structs and add them to free list. */ - r = alloc_cache_blocks_with_hash(p, cache_size); - if (!r) - return &p->policy; - - kfree(p); - - return NULL; -} -/*----------------------------------------------------------------------------*/ - -static struct dm_cache_policy_type wb_policy_type = { - .name = "cleaner", - .version = {1, 0, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = wb_create -}; - -static int __init wb_init(void) -{ - int r = dm_cache_policy_register(&wb_policy_type); - - if (r < 0) - DMERR("register failed %d", r); - else - DMINFO("version %u.%u.%u loaded", - wb_policy_type.version[0], - wb_policy_type.version[1], - wb_policy_type.version[2]); - - return r; -} - -static void __exit wb_exit(void) -{ - dm_cache_policy_unregister(&wb_policy_type); -} - -module_init(wb_init); -module_exit(wb_exit); - -MODULE_AUTHOR("Heinz Mauelshagen "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("cleaner cache policy"); diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 808ee0e2b2c4..56f0a23f698c 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -12,40 +12,59 @@ /*----------------------------------------------------------------*/ -/* - * Little inline functions that simplify calling the policy methods. 
- */ -static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) +static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued) { - return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); } -static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +static inline int policy_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - BUG_ON(!p->lookup); - return p->lookup(p, oblock, cblock); + if (!p->lookup_with_work) { + *work = NULL; + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); + } + + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); } -static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline int policy_get_background_work(struct dm_cache_policy *p, + bool idle, struct policy_work **result) { - if (p->set_dirty) - p->set_dirty(p, oblock); + return p->get_background_work(p, idle, result); } -static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline void policy_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { - if (p->clear_dirty) - p->clear_dirty(p, oblock); + return p->complete_background_work(p, work, success); +} + +static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + p->set_dirty(p, cblock); +} + +static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + p->clear_dirty(p, cblock); } static inline int policy_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + bool dirty, uint32_t hint, bool hint_valid) { - return p->load_mapping(p, oblock, cblock, hint, hint_valid); + return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); +} + +static inline int policy_invalidate_mapping(struct dm_cache_policy *p, + dm_cblock_t cblock) +{ + return p->invalidate_mapping(p, cblock); } static inline uint32_t policy_get_hint(struct dm_cache_policy *p, @@ -54,30 +73,6 @@ static inline uint32_t policy_get_hint(struct dm_cache_policy *p, return p->get_hint ? p->get_hint(p, cblock) : 0; } -static inline int policy_writeback_work(struct dm_cache_policy *p, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) -{ - return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; -} - -static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - p->remove_mapping(p, oblock); -} - -static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - return p->remove_cblock(p, cblock); -} - -static inline void policy_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - return p->force_mapping(p, current_oblock, new_oblock); -} - static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) { return p->residency(p); @@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p, return p->set_config_value ? 
p->set_config_value(p, key, value) : -EINVAL; } +static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + return p->allow_migrations(p, allow); +} + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index f19c6930a67c..74436dc2122f 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -4,8 +4,9 @@ * This file is released under the GPL. */ -#include "dm-cache-policy.h" +#include "dm-cache-background-tracker.h" #include "dm-cache-policy-internal.h" +#include "dm-cache-policy.h" #include "dm.h" #include @@ -38,10 +39,11 @@ struct entry { unsigned hash_next:28; unsigned prev:28; unsigned next:28; - unsigned level:7; + unsigned level:6; bool dirty:1; bool allocated:1; bool sentinel:1; + bool pending_work:1; dm_oblock_t oblock; }; @@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q) */ static void q_push(struct queue *q, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; l_add_tail(q->es, q->qs + e->level, e); } +static void q_push_front(struct queue *q, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_head(q->es, q->qs + e->level, e); +} + static void q_push_before(struct queue *q, struct entry *old, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; @@ -335,19 +351,6 @@ static struct entry *q_pop(struct queue *q) return e; } -/* - * Pops an entry from a level that is not past a sentinel. - */ -static struct entry *q_pop_old(struct queue *q, unsigned max_level) -{ - struct entry *e = q_peek(q, max_level, false); - - if (e) - q_del(q, e); - - return e; -} - /* * This function assumes there is a non-sentinel entry to pop. It's only * used by redistribute, so we know this is true. 
It also doesn't adjust @@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q) break; e->level = level + 1u; - l_add_head(q->es, l_above, e); + l_add_tail(q->es, l_above, e); } } } -static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) +static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, + struct entry *s1, struct entry *s2) { struct entry *de; - unsigned new_level; - - q_del(q, e); + unsigned sentinels_passed = 0; + unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); + /* try and find an entry to swap with */ if (extra_levels && (e->level < q->nr_levels - 1u)) { - new_level = min(q->nr_levels - 1u, e->level + extra_levels); - for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { - if (de->sentinel) - continue; + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) + sentinels_passed++; + if (de) { q_del(q, de); de->level = e->level; + if (s1) { + switch (sentinels_passed) { + case 0: + q_push_before(q, s1, de); + break; - if (dest) - q_push_before(q, dest, de); - else + case 1: + q_push_before(q, s2, de); + break; + + default: + q_push(q, de); + } + } else q_push(q, de); - break; } - - e->level = new_level; } + q_del(q, e); + e->level = new_level; q_push(q, e); } -static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) -{ - q_requeue_before(q, NULL, e, extra_levels); -} - /*----------------------------------------------------------------*/ #define FP_SHIFT 8 @@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s) /*----------------------------------------------------------------*/ -struct hash_table { +struct smq_hash_table { struct entry_space *es; unsigned long long hash_bits; unsigned *buckets; @@ -560,7 +567,7 @@ struct hash_table { * All cache entries are stored in a chained hash table. To save space we * use indexing again, and only store indexes to the next entry. 
*/ -static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) +static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) { unsigned i, nr_buckets; @@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent return 0; } -static void h_exit(struct hash_table *ht) +static void h_exit(struct smq_hash_table *ht) { vfree(ht->buckets); } -static struct entry *h_head(struct hash_table *ht, unsigned bucket) +static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) { return to_entry(ht->es, ht->buckets[bucket]); } -static struct entry *h_next(struct hash_table *ht, struct entry *e) +static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) { return to_entry(ht->es, e->hash_next); } -static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) +static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) { e->hash_next = ht->buckets[bucket]; ht->buckets[bucket] = to_index(ht->es, e); } -static void h_insert(struct hash_table *ht, struct entry *e) +static void h_insert(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); __h_insert(ht, h, e); } -static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, +static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, struct entry **prev) { struct entry *e; @@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o return NULL; } -static void __h_unlink(struct hash_table *ht, unsigned h, +static void __h_unlink(struct smq_hash_table *ht, unsigned h, struct entry *e, struct entry *prev) { if (prev) @@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h, /* * Also moves each entry to the front of the bucket. */ -static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) +static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) { struct entry *e, *prev; unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); @@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) return e; } -static void h_remove(struct hash_table *ht, struct entry *e) +static void h_remove(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); struct entry *prev; @@ -699,7 +706,10 @@ static void init_entry(struct entry *e) e->next = INDEXER_NULL; e->prev = INDEXER_NULL; e->level = 0u; + e->dirty = true; /* FIXME: audit */ e->allocated = true; + e->sentinel = false; + e->pending_work = false; } static struct entry *alloc_entry(struct entry_alloc *ea) @@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index) #define NR_HOTSPOT_LEVELS 64u #define NR_CACHE_LEVELS 64u -#define WRITEBACK_PERIOD (10 * HZ) -#define DEMOTE_PERIOD (60 * HZ) +#define WRITEBACK_PERIOD (10ul * HZ) +#define DEMOTE_PERIOD (60ul * HZ) #define HOTSPOT_UPDATE_PERIOD (HZ) -#define CACHE_UPDATE_PERIOD (10u * HZ) +#define CACHE_UPDATE_PERIOD (60ul * HZ) struct smq_policy { struct dm_cache_policy policy; @@ -814,8 +824,8 @@ struct smq_policy { * The hash tables allows us to quickly find an entry by origin * block. 
*/ - struct hash_table table; - struct hash_table hotspot_table; + struct smq_hash_table table; + struct smq_hash_table hotspot_table; bool current_writeback_sentinels; unsigned long next_writeback_period; @@ -828,6 +838,10 @@ struct smq_policy { unsigned long next_hotspot_period; unsigned long next_cache_period; + + struct background_tracker *bg_work; + + bool migrations_allowed; }; /*----------------------------------------------------------------*/ @@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq) static void update_sentinels(struct smq_policy *mq) { if (time_after(jiffies, mq->next_writeback_period)) { - __update_writeback_sentinels(mq); mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; mq->current_writeback_sentinels = !mq->current_writeback_sentinels; + __update_writeback_sentinels(mq); } if (time_after(jiffies, mq->next_demote_period)) { - __update_demote_sentinels(mq); mq->next_demote_period = jiffies + DEMOTE_PERIOD; mq->current_demote_sentinels = !mq->current_demote_sentinels; + __update_demote_sentinels(mq); } } @@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq) /*----------------------------------------------------------------*/ -/* - * These methods tie together the dirty queue, clean queue and hash table. - */ -static void push_new(struct smq_policy *mq, struct entry *e) +static void del_queue(struct smq_policy *mq, struct entry *e) { - struct queue *q = e->dirty ? &mq->dirty : &mq->clean; - h_insert(&mq->table, e); - q_push(q, e); + q_del(e->dirty ? &mq->dirty : &mq->clean, e); } +static void push_queue(struct smq_policy *mq, struct entry *e) +{ + if (e->dirty) + q_push(&mq->dirty, e); + else + q_push(&mq->clean, e); +} + +// !h, !q, a -> h, q, a static void push(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; - h_insert(&mq->table, e); - - /* - * Punch this into the queue just in front of the sentinel, to - * ensure it's cleaned straight away. - */ - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_push_before(&mq->dirty, sentinel, e); - } else { - sentinel = demote_sentinel(mq, e->level); - q_push_before(&mq->clean, sentinel, e); - } + if (!e->pending_work) + push_queue(mq, e); } -/* - * Removes an entry from cache. Removes from the hash table. - */ -static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) +static void push_queue_front(struct smq_policy *mq, struct entry *e) { - q_del(q, e); - h_remove(&mq->table, e); + if (e->dirty) + q_push_front(&mq->dirty, e); + else + q_push_front(&mq->clean, e); } -static void del(struct smq_policy *mq, struct entry *e) +static void push_front(struct smq_policy *mq, struct entry *e) { - __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); -} - -static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) -{ - struct entry *e = q_pop_old(q, max_level); - if (e) - h_remove(&mq->table, e); - return e; + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue_front(mq, e); } static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) @@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) static void requeue(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; + /* + * Pending work has temporarily been taken out of the queues. 
+ */ + if (e->pending_work) + return; if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_requeue_before(&mq->dirty, sentinel, e, 1u); - } else { - sentinel = demote_sentinel(mq, e->level); - q_requeue_before(&mq->clean, sentinel, e, 1u); + if (!e->dirty) { + q_requeue(&mq->clean, e, 1u, NULL, NULL); + return; } + + q_requeue(&mq->dirty, e, 1u, + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); } } @@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq) unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); + /* * If the hotspot queue is performing badly then we have little * confidence that we know which blocks to promote. So we cut down @@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq) } mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; - mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); } /* @@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq) } } -static int demote_cblock(struct smq_policy *mq, - struct policy_locker *locker, - dm_oblock_t *oblock) +/*----------------------------------------------------------------*/ + +/* + * Targets are given as a percentage. + */ +#define CLEAN_TARGET 25u +#define FREE_TARGET 25u + +static unsigned percent_to_target(struct smq_policy *mq, unsigned p) { - struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); - if (!demoted) - /* - * We could get a block from mq->dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; - - if (locker->fn(locker, demoted->oblock)) - /* - * We couldn't lock this block. - */ - return -EBUSY; - - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_alloc, demoted); - - return 0; + return from_cblock(mq->cache_size) * p / 100u; } +static bool clean_target_met(struct smq_policy *mq, bool idle) +{ + /* + * Cache entries may not be populated. So we cannot rely on the + * size of the clean queue. + */ + unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); + + if (idle) + /* + * We'd like to clean everything. 
+ */ + return q_size(&mq->dirty) == 0u; + else + return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= + percent_to_target(mq, CLEAN_TARGET); +} + +static bool free_target_met(struct smq_policy *mq, bool idle) +{ + unsigned nr_free = from_cblock(mq->cache_size) - + mq->cache_alloc.nr_allocated; + + if (idle) + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= + percent_to_target(mq, FREE_TARGET); + else + return true; +} + +/*----------------------------------------------------------------*/ + +static void mark_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(e->sentinel); + BUG_ON(!e->allocated); + BUG_ON(e->pending_work); + e->pending_work = true; +} + +static void clear_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(!e->pending_work); + e->pending_work = false; +} + +static void queue_writeback(struct smq_policy *mq) +{ + int r; + struct policy_work work; + struct entry *e; + + e = q_peek(&mq->dirty, mq->dirty.nr_levels, false); + if (e) { + mark_pending(mq, e); + q_del(&mq->dirty, e); + + work.op = POLICY_WRITEBACK; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + + r = btracker_queue(mq->bg_work, &work, NULL); + WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. + } +} + +static void queue_demotion(struct smq_policy *mq) +{ + struct policy_work work; + struct entry *e; + + if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) + return; + + e = q_peek(&mq->clean, mq->clean.nr_levels, true); + if (!e) { + if (!clean_target_met(mq, false)) + queue_writeback(mq); + return; + } + + mark_pending(mq, e); + q_del(&mq->clean, e); + + work.op = POLICY_DEMOTE; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, NULL); +} + +static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, + struct policy_work **workp) +{ + struct entry *e; + struct policy_work work; + + if (!mq->migrations_allowed) + return; + + if (allocator_empty(&mq->cache_alloc)) { + if (!free_target_met(mq, false)) + queue_demotion(mq); + return; + } + + if (btracker_promotion_already_present(mq->bg_work, oblock)) + return; + + /* + * We allocate the entry now to reserve the cblock. If the + * background work is aborted we must remember to free it. + */ + e = alloc_entry(&mq->cache_alloc); + BUG_ON(!e); + e->pending_work = true; + work.op = POLICY_PROMOTE; + work.oblock = oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, workp); +} + +/*----------------------------------------------------------------*/ + enum promote_result { PROMOTE_NOT, PROMOTE_TEMPORARY, @@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote) return promote ? 
PROMOTE_PERMANENT : PROMOTE_NOT; } -static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, - bool fast_promote) +static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, + int data_dir, bool fast_promote) { - if (bio_data_dir(bio) == WRITE) { + if (data_dir == WRITE) { if (!allocator_empty(&mq->cache_alloc) && fast_promote) return PROMOTE_TEMPORARY; - else - return maybe_promote(hs_e->level >= mq->write_promote_level); + return maybe_promote(hs_e->level >= mq->write_promote_level); } else return maybe_promote(hs_e->level >= mq->read_promote_level); } -static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result, enum promote_result pr) -{ - int r; - struct entry *e; - - if (allocator_empty(&mq->cache_alloc)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return; - } - - } else - result->op = POLICY_NEW; - - e = alloc_entry(&mq->cache_alloc); - BUG_ON(!e); - e->oblock = oblock; - - if (pr == PROMOTE_TEMPORARY) - push(mq, e); - else - push_new(mq, e); - - result->cblock = infer_cblock(mq, e); -} - static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) { sector_t r = from_oblock(b); @@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) return to_oblock(r); } -static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) +static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) { unsigned hi; dm_oblock_t hb = to_hblock(mq, b); @@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, hi = get_index(&mq->hotspot_alloc, e); q_requeue(&mq->hotspot, e, test_and_set_bit(hi, mq->hotspot_hit_bits) ? - 0u : mq->hotspot_level_jump); + 0u : mq->hotspot_level_jump, + NULL, NULL); } else { stats_miss(&mq->hotspot_stats); @@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, return e; } -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. 
- */ -static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, - bool can_migrate, bool fast_promote, - struct policy_locker *locker, struct policy_result *result) -{ - struct entry *e, *hs_e; - enum promote_result pr; - - hs_e = update_hotspot_queue(mq, oblock, bio); - - e = h_lookup(&mq->table, oblock); - if (e) { - stats_level_accessed(&mq->cache_stats, e->level); - - requeue(mq, e); - result->op = POLICY_HIT; - result->cblock = infer_cblock(mq, e); - - } else { - stats_miss(&mq->cache_stats); - - pr = should_promote(mq, hs_e, bio, fast_promote); - if (pr == PROMOTE_NOT) - result->op = POLICY_MISS; - - else { - if (!can_migrate) { - result->op = POLICY_MISS; - return -EWOULDBLOCK; - } - - insert_in_cache(mq, oblock, locker, result, pr); - } - } - - return 0; -} - /*----------------------------------------------------------------*/ /* @@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p) { struct smq_policy *mq = to_smq_policy(p); + btracker_destroy(mq->bg_work); h_exit(&mq->hotspot_table); h_exit(&mq->table); free_bitset(mq->hotspot_hit_bits); @@ -1290,72 +1334,193 @@ static void smq_destroy(struct dm_cache_policy *p) kfree(mq); } -static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool fast_promote, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) +/*----------------------------------------------------------------*/ + +static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work, bool *background_work) { - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); + struct entry *e, *hs_e; + enum promote_result pr; - result->op = POLICY_MISS; + *background_work = false; - spin_lock_irqsave(&mq->lock, flags); - r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - -static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - struct entry *e; - - spin_lock_irqsave(&mq->lock, flags); e = h_lookup(&mq->table, oblock); if (e) { + stats_level_accessed(&mq->cache_stats, e->level); + + requeue(mq, e); *cblock = infer_cblock(mq, e); - r = 0; - } else - r = -ENOENT; + return 0; + + } else { + stats_miss(&mq->cache_stats); + + /* + * The hotspot queue only gets updated with misses. 
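For reference, a hedged userspace sketch of how a caller is expected to consume the new lookup contract (0 plus a valid cblock on a hit, -ENOENT on a miss, *background_queued set when a promotion was scheduled behind the scenes). The toy policy_lookup() below is a stand-in, not the dm-cache API:

/* Toy policy: blocks below 100 are "cached" at cblock == oblock. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int policy_lookup(unsigned long oblock, unsigned long *cblock,
                         bool *background_queued)
{
        *background_queued = false;

        if (oblock < 100) {
                *cblock = oblock;
                return 0;
        }

        /* Miss: pretend even-numbered blocks look hot enough to promote. */
        *background_queued = (oblock % 2 == 0);
        return -ENOENT;
}

int main(void)
{
        unsigned long blocks[] = { 5, 104 };
        unsigned long cblock;
        bool bg;
        int i;

        for (i = 0; i < 2; i++) {
                if (!policy_lookup(blocks[i], &cblock, &bg))
                        printf("oblock %lu: hit, cache block %lu\n",
                               blocks[i], cblock);
                else
                        printf("oblock %lu: miss%s\n", blocks[i],
                               bg ? ", promotion queued" : "");
        }
        return 0;
}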
+ */ + hs_e = update_hotspot_queue(mq, oblock); + + pr = should_promote(mq, hs_e, data_dir, fast_copy); + if (pr != PROMOTE_NOT) { + queue_promotion(mq, oblock, work); + *background_work = true; + } + + return -ENOENT; + } +} + +static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + bool *background_work) +{ + int r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = __lookup(mq, oblock, cblock, + data_dir, fast_copy, + NULL, background_work); spin_unlock_irqrestore(&mq->lock, flags); return r; } -static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) +static int smq_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - struct entry *e; + int r; + bool background_queued; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); + spin_lock_irqsave(&mq->lock, flags); + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); + spin_unlock_irqrestore(&mq->lock, flags); - del(mq, e); - e->dirty = set; - push(mq, e); + return r; } -static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static int smq_get_background_work(struct dm_cache_policy *p, bool idle, + struct policy_work **result) +{ + int r; + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + r = btracker_issue(mq->bg_work, result); + if (r == -ENODATA) { + /* find some writeback work to do */ + if (mq->migrations_allowed && !free_target_met(mq, idle)) + queue_demotion(mq); + + else if (!clean_target_met(mq, idle)) + queue_writeback(mq); + + r = btracker_issue(mq->bg_work, result); + } + spin_unlock_irqrestore(&mq->lock, flags); + + return r; +} + +/* + * We need to clear any pending work flags that have been set, and in the + * case of promotion free the entry for the destination cblock. 
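A rough userspace model of the get/complete background-work cycle described above; policy_get_background_work(), do_migration() and policy_complete_background_work() here are stand-ins for illustration only, and the same work pointer is handed back on completion as the header requires:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum policy_operation { POLICY_PROMOTE, POLICY_DEMOTE, POLICY_WRITEBACK };

struct policy_work {
        enum policy_operation op;
        unsigned long oblock;
        unsigned long cblock;
};

/* Hand out one fake writeback, then report "no work". */
static int policy_get_background_work(bool idle, struct policy_work **result)
{
        static struct policy_work wb = { POLICY_WRITEBACK, 1234, 56 };
        static bool issued;

        (void)idle;             /* the toy ignores the idle hint */
        if (issued)
                return -ENODATA;

        issued = true;
        *result = &wb;
        return 0;
}

static bool do_migration(struct policy_work *w)
{
        printf("op %d: oblock %lu <-> cblock %lu\n", w->op, w->oblock, w->cblock);
        return true;            /* pretend the copy succeeded */
}

static void policy_complete_background_work(struct policy_work *w, bool success)
{
        printf("completed oblock %lu, success=%d\n", w->oblock, success);
}

int main(void)
{
        struct policy_work *w;

        while (!policy_get_background_work(true, &w))
                policy_complete_background_work(w, do_migration(w));

        return 0;
}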
+ */ +static void __complete_background_work(struct smq_policy *mq, + struct policy_work *work, + bool success) +{ + struct entry *e = get_entry(&mq->cache_alloc, + from_cblock(work->cblock)); + + switch (work->op) { + case POLICY_PROMOTE: + // !h, !q, a + clear_pending(mq, e); + if (success) { + e->oblock = work->oblock; + push(mq, e); + // h, q, a + } else { + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } + break; + + case POLICY_DEMOTE: + // h, !q, a + if (success) { + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } else { + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + } + break; + + case POLICY_WRITEBACK: + // h, !q, a + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + break; + } + + btracker_complete(mq->bg_work, work); +} + +static void smq_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { unsigned long flags; struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, true); + __complete_background_work(mq, work, success); spin_unlock_irqrestore(&mq->lock, flags); } -static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +// in_hash(oblock) -> in_hash(oblock) +static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) +{ + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (e->pending_work) + e->dirty = set; + else { + del_queue(mq, e); + e->dirty = set; + push_queue(mq, e); + } +} + +static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); + + spin_lock_irqsave(&mq->lock, flags); + __smq_set_clear_dirty(mq, cblock, true); + spin_unlock_irqrestore(&mq->lock, flags); +} + +static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { struct smq_policy *mq = to_smq_policy(p); unsigned long flags; spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, false); + __smq_set_clear_dirty(mq, cblock, false); spin_unlock_irqrestore(&mq->lock, flags); } @@ -1366,17 +1531,38 @@ static unsigned random_level(dm_cblock_t cblock) static int smq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + bool dirty, uint32_t hint, bool hint_valid) { struct smq_policy *mq = to_smq_policy(p); struct entry *e; e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); e->oblock = oblock; - e->dirty = false; /* this gets corrected in a minute */ + e->dirty = dirty; e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); - push(mq, e); + e->pending_work = false; + /* + * When we load mappings we push ahead of both sentinels in order to + * allow demotions and cleaning to occur immediately. + */ + push_front(mq, e); + + return 0; +} + +static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + + if (!e->allocated) + return -ENODATA; + + // FIXME: what if this block has pending background work? 
+ del_queue(mq, e); + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); return 0; } @@ -1391,135 +1577,6 @@ static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) return e->level; } -static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; - - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); - - del(mq, e); - free_entry(&mq->cache_alloc, e); -} - -static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct smq_policy *mq = to_smq_policy(p); - unsigned long flags; - - spin_lock_irqsave(&mq->lock, flags); - __remove_mapping(mq, oblock); - spin_unlock_irqrestore(&mq->lock, flags); -} - -static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) -{ - struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - - if (!e || !e->allocated) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_alloc, e); - - return 0; -} - -static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - spin_lock_irqsave(&mq->lock, flags); - r = __remove_cblock(mq, cblock); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - - -#define CLEAN_TARGET_CRITICAL 5u /* percent */ - -static bool clean_target_met(struct smq_policy *mq, bool critical) -{ - if (critical) { - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. - */ - unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; - - return nr_clean >= target; - } else - return !q_size(&mq->dirty); -} - -static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - struct entry *e = NULL; - bool target_met = clean_target_met(mq, critical_only); - - if (critical_only) - /* - * Always try and keep the bottom level clean. - */ - e = pop_old(mq, &mq->dirty, target_met ? 
1u : mq->dirty.nr_levels); - - else - e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); - - if (!e) - return -ENODATA; - - *oblock = e->oblock; - *cblock = infer_cblock(mq, e); - e->dirty = false; - push_new(mq, e); - - return 0; -} - -static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - spin_lock_irqsave(&mq->lock, flags); - r = __smq_writeback_work(mq, oblock, cblock, critical_only); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - -static void __force_mapping(struct smq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = h_lookup(&mq->table, current_oblock); - - if (e) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } -} - -static void smq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - spin_lock_irqsave(&mq->lock, flags); - __force_mapping(mq, current_oblock, new_oblock); - spin_unlock_irqrestore(&mq->lock, flags); -} - static dm_cblock_t smq_residency(struct dm_cache_policy *p) { dm_cblock_t r; @@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + struct smq_policy *mq = to_smq_policy(p); + mq->migrations_allowed = allow; +} + /* * smq has no config values, but the old mq policy did. To avoid breaking * software we continue to accept these configurables for the mq policy, @@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; - mq->policy.map = smq_map; mq->policy.lookup = smq_lookup; + mq->policy.lookup_with_work = smq_lookup_with_work; + mq->policy.get_background_work = smq_get_background_work; + mq->policy.complete_background_work = smq_complete_background_work; mq->policy.set_dirty = smq_set_dirty; mq->policy.clear_dirty = smq_clear_dirty; mq->policy.load_mapping = smq_load_mapping; + mq->policy.invalidate_mapping = smq_invalidate_mapping; mq->policy.get_hint = smq_get_hint; - mq->policy.remove_mapping = smq_remove_mapping; - mq->policy.remove_cblock = smq_remove_cblock; - mq->policy.writeback_work = smq_writeback_work; - mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + mq->policy.allow_migrations = smq_allow_migrations; if (mimic_mq) { mq->policy.set_config_value = mq_set_config_value; @@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size, static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, - bool mimic_mq) + bool mimic_mq, + bool migrations_allowed) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, } init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); - for (i = 0; i < nr_sentinels_per_queue; i++) + for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); - for (i = 0; i < nr_sentinels_per_queue; i++) + 
for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, @@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, mq->next_hotspot_period = jiffies; mq->next_cache_period = jiffies; + mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ + if (!mq->bg_work) + goto bad_btracker; + + mq->migrations_allowed = migrations_allowed; + return &mq->policy; +bad_btracker: + h_exit(&mq->hotspot_table); bad_alloc_hotspot_table: h_exit(&mq->table); bad_alloc_table: @@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, false); + return __smq_create(cache_size, origin_size, cache_block_size, false, true); } static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, true); + return __smq_create(cache_size, origin_size, cache_block_size, true, true); +} + +static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false, false); } /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create @@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = { static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create, }; +static struct dm_cache_policy_type cleaner_policy_type = { + .name = "cleaner", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = cleaner_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1785,23 +1872,36 @@ static int __init smq_init(void) r = dm_cache_policy_register(&mq_policy_type); if (r) { DMERR("register failed (as mq) %d", r); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_mq; + } + + r = dm_cache_policy_register(&cleaner_policy_type); + if (r) { + DMERR("register failed (as cleaner) %d", r); + goto out_cleaner; } r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); - dm_cache_policy_unregister(&mq_policy_type); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_default; } return 0; + +out_default: + dm_cache_policy_unregister(&cleaner_policy_type); +out_cleaner: + dm_cache_policy_unregister(&mq_policy_type); +out_mq: + dm_cache_policy_unregister(&smq_policy_type); + + return -ENOMEM; } static void __exit smq_exit(void) { + dm_cache_policy_unregister(&cleaner_policy_type); dm_cache_policy_unregister(&smq_policy_type); dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); @@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); MODULE_ALIAS("dm-cache-mq"); +MODULE_ALIAS("dm-cache-cleaner"); diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h 
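The reworked smq_init() above uses the usual register-then-unwind idiom; a self-contained sketch of that error-handling pattern follows (the register_*/unregister_* functions are placeholders, not the dm_cache_policy_register() API):

#include <stdio.h>

static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static int register_c(void) { return -1; }      /* simulate a failure */

static void unregister_a(void) { puts("unregister a"); }
static void unregister_b(void) { puts("unregister b"); }

static int init(void)
{
        int r;

        r = register_a();
        if (r)
                goto out;

        r = register_b();
        if (r)
                goto out_a;

        r = register_c();
        if (r)
                goto out_b;

        return 0;

out_b:
        unregister_b();         /* undo in reverse order of registration */
out_a:
        unregister_a();
out:
        return r;
}

int main(void)
{
        printf("init() = %d\n", init());
        return 0;
}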
index aa10b1493f34..c05fc3436cef 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -13,147 +13,94 @@ /*----------------------------------------------------------------*/ -/* FIXME: make it clear which methods are optional. Get debug policy to - * double check this at start. - */ - /* * The cache policy makes the important decisions about which blocks get to * live on the faster cache device. - * - * When the core target has to remap a bio it calls the 'map' method of the - * policy. This returns an instruction telling the core target what to do. - * - * POLICY_HIT: - * That block is in the cache. Remap to the cache and carry on. - * - * POLICY_MISS: - * This block is on the origin device. Remap and carry on. - * - * POLICY_NEW: - * This block is currently on the origin device, but the policy wants to - * move it. The core should: - * - * - hold any further io to this origin block - * - copy the origin to the given cache block - * - release all the held blocks - * - remap the original block to the cache - * - * POLICY_REPLACE: - * This block is currently on the origin device. The policy wants to - * move it to the cache, with the added complication that the destination - * cache block needs a writeback first. The core should: - * - * - hold any further io to this origin block - * - hold any further io to the origin block that's being written back - * - writeback - * - copy new block to cache - * - release held blocks - * - remap bio to cache and reissue. - * - * Should the core run into trouble while processing a POLICY_NEW or - * POLICY_REPLACE instruction it will roll back the policies mapping using - * remove_mapping() or force_mapping(). These methods must not fail. This - * approach avoids having transactional semantics in the policy (ie, the - * core informing the policy when a migration is complete), and hence makes - * it easier to write new policies. - * - * In general policy methods should never block, except in the case of the - * map function when can_migrate is set. So be careful to implement using - * bounded, preallocated memory. */ enum policy_operation { - POLICY_HIT, - POLICY_MISS, - POLICY_NEW, - POLICY_REPLACE -}; - -/* - * When issuing a POLICY_REPLACE the policy needs to make a callback to - * lock the block being demoted. This doesn't need to occur during a - * writeback operation since the block remains in the cache. - */ -struct policy_locker; -typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); - -struct policy_locker { - policy_lock_fn fn; + POLICY_PROMOTE, + POLICY_DEMOTE, + POLICY_WRITEBACK }; /* * This is the instruction passed back to the core target. */ -struct policy_result { +struct policy_work { enum policy_operation op; - dm_oblock_t old_oblock; /* POLICY_REPLACE */ - dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ + dm_oblock_t oblock; + dm_cblock_t cblock; }; /* - * The cache policy object. Just a bunch of methods. It is envisaged that - * this structure will be embedded in a bigger, policy specific structure - * (ie. use container_of()). + * The cache policy object. It is envisaged that this structure will be + * embedded in a bigger, policy specific structure (ie. use container_of()). */ struct dm_cache_policy { - - /* - * FIXME: make it clear which methods are optional, and which may - * block. - */ - /* * Destroys this object. */ void (*destroy)(struct dm_cache_policy *p); /* - * See large comment above. - * - * oblock - the origin block we're interested in. 
- * - * can_block - indicates whether the current thread is allowed to - * block. -EWOULDBLOCK returned if it can't and would. - * - * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE - * instructions. If denied and the policy would have - * returned one of these instructions it should - * return -EWOULDBLOCK. - * - * discarded_oblock - indicates whether the whole origin block is - * in a discarded state (FIXME: better to tell the - * policy about this sooner, so it can recycle that - * cache block if it wants.) - * bio - the bio that triggered this call. - * result - gets filled in with the instruction. - * - * May only return 0, or -EWOULDBLOCK (if !can_migrate) - */ - int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result); - - /* - * Sometimes we want to see if a block is in the cache, without - * triggering any update of stats. (ie. it's not a real hit). + * Find the location of a block. * * Must not block. * - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors - * (-EWOULDBLOCK would be typical). + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for + * other errors (-EWOULDBLOCK would be typical). data_dir should be + * READ or WRITE. fast_copy should be set if migrating this block would + * be 'cheap' somehow (eg, discarded data). background_queued will be set + * if a migration has just been queued. */ - int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued); - void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); + /* + * Sometimes the core target can optimise a migration, eg, the + * block may be discarded, or the bio may cover an entire block. + * In order to optimise it needs the migration immediately though + * so it knows to do something different with the bio. + * + * This method is optional (policy-internal will fallback to using + * lookup). + */ + int (*lookup_with_work)(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work); + + /* + * Retrieves background work. Returns -ENODATA when there's no + * background work. + */ + int (*get_background_work)(struct dm_cache_policy *p, bool idle, + struct policy_work **result); + + /* + * You must pass in the same work pointer that you were given, not + * a copy. + */ + void (*complete_background_work)(struct dm_cache_policy *p, + struct policy_work *work, + bool success); + + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * Called when a cache target is first created. Used to load a * mapping from the metadata device into the policy. */ int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, - dm_cblock_t cblock, uint32_t hint, bool hint_valid); + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); + + /* + * Drops the mapping, irrespective of whether it's clean or dirty. + * Returns -ENODATA if cblock is not mapped. + */ + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * Gets the hint for a given cblock. 
Called in a single threaded @@ -161,36 +108,6 @@ struct dm_cache_policy { */ uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); - /* - * Override functions used on the error paths of the core target. - * They must succeed. - */ - void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, - dm_oblock_t new_oblock); - - /* - * This is called via the invalidate_cblocks message. It is - * possible the particular cblock has already been removed due to a - * write io in passthrough mode. In which case this should return - * -ENODATA. - */ - int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); - - /* - * Provide a dirty block to be written back by the core target. If - * critical_only is set then the policy should only provide work if - * it urgently needs it. - * - * Returns: - * - * 0 and @cblock,@oblock: block to write back provided - * - * -ENODATA: no dirty blocks available - */ - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, - bool critical_only); - /* * How full is the cache? */ @@ -202,6 +119,8 @@ struct dm_cache_policy { * queue merging has occurred). To stop the policy being fooled by * these, the core target sends regular tick() calls to the policy. * The policy should only count an entry as hit once per tick. + * + * This method is optional. */ void (*tick)(struct dm_cache_policy *p, bool can_block); @@ -213,6 +132,8 @@ struct dm_cache_policy { int (*set_config_value)(struct dm_cache_policy *p, const char *key, const char *value); + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); + /* * Book keeping ptr for the policy register, not for general use. */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2eaa414e1509..b7de289a10bb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -5,7 +5,7 @@ */ #include "dm.h" -#include "dm-bio-prison-v1.h" +#include "dm-bio-prison-v2.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, /*----------------------------------------------------------------*/ -#define IOT_RESOLUTION 4 +/* + * Glossary: + * + * oblock: index of an origin block + * cblock: index of a cache block + * promotion: movement of a block from origin to cache + * demotion: movement of a block from cache to origin + * migration: movement of a block between the origin and cache device, + * either direction + */ + +/*----------------------------------------------------------------*/ struct io_tracker { spinlock_t lock; @@ -99,18 +111,177 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) /*----------------------------------------------------------------*/ /* - * Glossary: - * - * oblock: index of an origin block - * cblock: index of a cache block - * promotion: movement of a block from origin to cache - * demotion: movement of a block from cache to origin - * migration: movement of a block between the origin and cache device, - * either direction + * Represents a chunk of future work. 'input' allows continuations to pass + * values between themselves, typically error values. 
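A small userspace sketch of the continuation pattern, assuming a trivial work_struct that is just a bare function pointer; it only shows how container_of() recovers the enclosing per-migration state and how 'input' carries an error code between stages:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
        void (*fn)(struct work_struct *ws);
};

struct continuation {
        struct work_struct ws;
        int input;              /* typically an error value */
};

struct migration {
        struct continuation k;
        unsigned long oblock;
};

static void migration_done(struct work_struct *ws)
{
        struct continuation *k = container_of(ws, struct continuation, ws);
        struct migration *mg = container_of(k, struct migration, k);

        printf("oblock %lu finished, input=%d\n", mg->oblock, k->input);
}

int main(void)
{
        struct migration mg = {
                .k = { .ws = { migration_done }, .input = 0 },
                .oblock = 42,
        };

        /* A commit handler would set k->input and queue the work. */
        mg.k.input = -5;        /* e.g. -EIO from a failed commit */
        mg.k.ws.fn(&mg.k.ws);
        return 0;
}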
*/ +struct continuation { + struct work_struct ws; + int input; +}; + +static inline void init_continuation(struct continuation *k, + void (*fn)(struct work_struct *)) +{ + INIT_WORK(&k->ws, fn); + k->input = 0; +} + +static inline void queue_continuation(struct workqueue_struct *wq, + struct continuation *k) +{ + queue_work(wq, &k->ws); +} /*----------------------------------------------------------------*/ +/* + * The batcher collects together pieces of work that need a particular + * operation to occur before they can proceed (typically a commit). + */ +struct batcher { + /* + * The operation that everyone is waiting for. + */ + int (*commit_op)(void *context); + void *commit_context; + + /* + * This is how bios should be issued once the commit op is complete + * (accounted_request). + */ + void (*issue_op)(struct bio *bio, void *context); + void *issue_context; + + /* + * Queued work gets put on here after commit. + */ + struct workqueue_struct *wq; + + spinlock_t lock; + struct list_head work_items; + struct bio_list bios; + struct work_struct commit_work; + + bool commit_scheduled; +}; + +static void __commit(struct work_struct *_ws) +{ + struct batcher *b = container_of(_ws, struct batcher, commit_work); + + int r; + unsigned long flags; + struct list_head work_items; + struct work_struct *ws, *tmp; + struct continuation *k; + struct bio *bio; + struct bio_list bios; + + INIT_LIST_HEAD(&work_items); + bio_list_init(&bios); + + /* + * We have to grab these before the commit_op to avoid a race + * condition. + */ + spin_lock_irqsave(&b->lock, flags); + list_splice_init(&b->work_items, &work_items); + bio_list_merge(&bios, &b->bios); + bio_list_init(&b->bios); + b->commit_scheduled = false; + spin_unlock_irqrestore(&b->lock, flags); + + r = b->commit_op(b->commit_context); + + list_for_each_entry_safe(ws, tmp, &work_items, entry) { + k = container_of(ws, struct continuation, ws); + k->input = r; + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ + queue_work(b->wq, ws); + } + + while ((bio = bio_list_pop(&bios))) { + if (r) { + bio->bi_error = r; + bio_endio(bio); + } else + b->issue_op(bio, b->issue_context); + } +} + +static void batcher_init(struct batcher *b, + int (*commit_op)(void *), + void *commit_context, + void (*issue_op)(struct bio *bio, void *), + void *issue_context, + struct workqueue_struct *wq) +{ + b->commit_op = commit_op; + b->commit_context = commit_context; + b->issue_op = issue_op; + b->issue_context = issue_context; + b->wq = wq; + + spin_lock_init(&b->lock); + INIT_LIST_HEAD(&b->work_items); + bio_list_init(&b->bios); + INIT_WORK(&b->commit_work, __commit); + b->commit_scheduled = false; +} + +static void async_commit(struct batcher *b) +{ + queue_work(b->wq, &b->commit_work); +} + +static void continue_after_commit(struct batcher *b, struct continuation *k) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + list_add_tail(&k->ws.entry, &b->work_items); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Bios are errored if commit failed. 
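To make the batching idea concrete, a stripped-down userspace model follows; locking, the workqueue and the bio list are omitted, and all names are illustrative rather than the batcher API added by this patch:

#include <stdio.h>

#define MAX_WAITERS 8

struct toy_batcher {
        int (*commit_op)(void *context);
        void *commit_context;
        int nr_waiters;
        void (*waiters[MAX_WAITERS])(int result);
};

static void toy_continue_after_commit(struct toy_batcher *b, void (*k)(int))
{
        b->waiters[b->nr_waiters++] = k;
}

static void toy_commit(struct toy_batcher *b)
{
        int r = b->commit_op(b->commit_context);
        int i;

        for (i = 0; i < b->nr_waiters; i++)
                b->waiters[i](r);       /* everyone sees the same result */
        b->nr_waiters = 0;
}

static int fake_commit(void *context) { (void)context; return 0; }
static void waiter_a(int r) { printf("A continues, r=%d\n", r); }
static void waiter_b(int r) { printf("B continues, r=%d\n", r); }

int main(void)
{
        struct toy_batcher b = { .commit_op = fake_commit };

        toy_continue_after_commit(&b, waiter_a);
        toy_continue_after_commit(&b, waiter_b);
        toy_commit(&b);         /* one commit serves both waiters */
        return 0;
}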
+ */ +static void issue_after_commit(struct batcher *b, struct bio *bio) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + bio_list_add(&b->bios, bio); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Call this if some urgent work is waiting for the commit to complete. + */ +static void schedule_commit(struct batcher *b) +{ + bool immediate; + unsigned long flags; + + spin_lock_irqsave(&b->lock, flags); + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); + b->commit_scheduled = true; + spin_unlock_irqrestore(&b->lock, flags); + + if (immediate) + async_commit(b); +} + /* * There are a couple of places where we let a bio run, but want to do some * work before calling its endio function. We do this by temporarily @@ -189,31 +360,13 @@ struct cache_stats { atomic_t write_miss; atomic_t demotion; atomic_t promotion; + atomic_t writeback; atomic_t copies_avoided; atomic_t cache_cell_clash; atomic_t commit_count; atomic_t discard_count; }; -/* - * Defines a range of cblocks, begin to (end - 1) are in the range. end is - * the one-past-the-end value. - */ -struct cblock_range { - dm_cblock_t begin; - dm_cblock_t end; -}; - -struct invalidation_request { - struct list_head list; - struct cblock_range *cblocks; - - atomic_t complete; - int err; - - wait_queue_head_t result_wait; -}; - struct cache { struct dm_target *ti; struct dm_target_callbacks callbacks; @@ -255,11 +408,7 @@ struct cache { spinlock_t lock; struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_flush_bios; struct bio_list deferred_writethrough_bios; - struct list_head quiesced_migrations; - struct list_head completed_migrations; - struct list_head need_commit_migrations; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -270,9 +419,7 @@ struct cache { */ atomic_t nr_io_migrations; - wait_queue_head_t quiescing_wait; - atomic_t quiescing; - atomic_t quiescing_ack; + struct rw_semaphore quiesce_lock; /* * cache_size entries, dirty if set @@ -296,13 +443,11 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; - struct work_struct worker; - + struct work_struct deferred_bio_worker; + struct work_struct deferred_writethrough_worker; + struct work_struct migration_worker; struct delayed_work waker; - unsigned long last_commit_jiffies; - - struct dm_bio_prison *prison; - struct dm_deferred_set *all_io_ds; + struct dm_bio_prison_v2 *prison; mempool_t *migration_pool; @@ -330,12 +475,17 @@ struct cache { struct list_head invalidation_requests; struct io_tracker origin_tracker; + + struct work_struct commit_ws; + struct batcher committer; + + struct rw_semaphore background_work_lock; }; struct per_bio_data { bool tick:1; unsigned req_nr:2; - struct dm_deferred_entry *all_io_entry; + struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; @@ -350,55 +500,64 @@ struct per_bio_data { }; struct dm_cache_migration { - struct list_head list; + struct continuation k; struct cache *cache; - unsigned long start_jiffies; - dm_oblock_t old_oblock; - dm_oblock_t new_oblock; - dm_cblock_t cblock; + struct policy_work *op; + struct bio *overwrite_bio; + struct dm_bio_prison_cell_v2 *cell; - bool err:1; - bool discard:1; - bool writeback:1; - bool demote:1; - bool promote:1; - bool requeue_holder:1; - bool invalidate:1; - - struct dm_bio_prison_cell *old_ocell; - 
struct dm_bio_prison_cell *new_ocell; + dm_cblock_t invalidate_cblock; + dm_oblock_t invalidate_oblock; }; -/* - * Processing a bio in the worker thread may require these memory - * allocations. We prealloc to avoid deadlocks (the same worker thread - * frees them back to the mempool). - */ -struct prealloc { - struct dm_cache_migration *mg; - struct dm_bio_prison_cell *cell1; - struct dm_bio_prison_cell *cell2; -}; +/*----------------------------------------------------------------*/ -static enum cache_metadata_mode get_cache_mode(struct cache *cache); - -static void wake_worker(struct cache *cache) +static bool writethrough_mode(struct cache_features *f) { - queue_work(cache->wq, &cache->worker); + return f->io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} + +static inline bool passthrough_mode(struct cache_features *f) +{ + return unlikely(f->io_mode == CM_IO_PASSTHROUGH); } /*----------------------------------------------------------------*/ -static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) +static void wake_deferred_bio_worker(struct cache *cache) { - /* FIXME: change to use a local slab. */ - return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); + queue_work(cache->wq, &cache->deferred_bio_worker); } -static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) +static void wake_deferred_writethrough_worker(struct cache *cache) { - dm_bio_prison_free_cell(cache->prison, cell); + queue_work(cache->wq, &cache->deferred_writethrough_worker); +} + +static void wake_migration_worker(struct cache *cache) +{ + if (passthrough_mode(&cache->features)) + return; + + queue_work(cache->wq, &cache->migration_worker); +} + +/*----------------------------------------------------------------*/ + +static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) +{ + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); +} + +static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) +{ + dm_bio_prison_free_cell_v2(cache->prison, cell); } static struct dm_cache_migration *alloc_migration(struct cache *cache) @@ -424,91 +583,14 @@ static void free_migration(struct dm_cache_migration *mg) mempool_free(mg, cache->migration_pool); } -static int prealloc_data_structs(struct cache *cache, struct prealloc *p) -{ - if (!p->mg) { - p->mg = alloc_migration(cache); - if (!p->mg) - return -ENOMEM; - } - - if (!p->cell1) { - p->cell1 = alloc_prison_cell(cache); - if (!p->cell1) - return -ENOMEM; - } - - if (!p->cell2) { - p->cell2 = alloc_prison_cell(cache); - if (!p->cell2) - return -ENOMEM; - } - - return 0; -} - -static void prealloc_free_structs(struct cache *cache, struct prealloc *p) -{ - if (p->cell2) - free_prison_cell(cache, p->cell2); - - if (p->cell1) - free_prison_cell(cache, p->cell1); - - if (p->mg) - free_migration(p->mg); -} - -static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) -{ - struct dm_cache_migration *mg = p->mg; - - BUG_ON(!mg); - p->mg = NULL; - - return mg; -} - -/* - * You must have a cell within the prealloc struct to return. If not this - * function will BUG() rather than returning NULL. 
- */ -static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) -{ - struct dm_bio_prison_cell *r = NULL; - - if (p->cell1) { - r = p->cell1; - p->cell1 = NULL; - - } else if (p->cell2) { - r = p->cell2; - p->cell2 = NULL; - } else - BUG(); - - return r; -} - -/* - * You can't have more than two cells in a prealloc struct. BUG() will be - * called if you try and overfill. - */ -static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) -{ - if (!p->cell2) - p->cell2 = cell; - - else if (!p->cell1) - p->cell1 = cell; - - else - BUG(); -} - /*----------------------------------------------------------------*/ -static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) +static inline dm_oblock_t oblock_succ(dm_oblock_t b) +{ + return to_oblock(from_oblock(b) + 1ull); +} + +static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) { key->virtual = 0; key->dev = 0; @@ -517,53 +599,111 @@ static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *ke } /* - * The caller hands in a preallocated cell, and a free function for it. - * The cell will be freed if there's an error, or if it wasn't used because - * a cell with that key already exists. + * We have two lock levels. Level 0, which is used to prevent WRITEs, and + * level 1 which prevents *both* READs and WRITEs. */ -typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); +#define WRITE_LOCK_LEVEL 0 +#define READ_WRITE_LOCK_LEVEL 1 -static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +static unsigned lock_level(struct bio *bio) { - int r; - struct dm_cell_key key; - - build_key(oblock_begin, oblock_end, &key); - r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); - if (r) - free_fn(free_context, cell_prealloc); - - return r; + return bio_data_dir(bio) == WRITE ? + WRITE_LOCK_LEVEL : + READ_WRITE_LOCK_LEVEL; } -static int bio_detain(struct cache *cache, dm_oblock_t oblock, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +/*---------------------------------------------------------------- + * Per bio data + *--------------------------------------------------------------*/ + +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. + */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) + +static size_t get_per_bio_data_size(struct cache *cache) { + return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; +} + +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + BUG_ON(!pb); + return pb; +} + +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = get_per_bio_data(bio, data_size); + + pb->tick = false; + pb->req_nr = dm_bio_get_target_bio_nr(bio); + pb->cell = NULL; + pb->len = 0; + + return pb; +} + +/*----------------------------------------------------------------*/ + +static void defer_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_deferred_bio_worker(cache); +} + +static void defer_bios(struct cache *cache, struct bio_list *bios) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&cache->deferred_bios, bios); + bio_list_init(bios); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_deferred_bio_worker(cache); +} + +/*----------------------------------------------------------------*/ + +static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) +{ + bool r; + size_t pb_size; + struct per_bio_data *pb; + struct dm_cell_key_v2 key; dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); - return bio_detain_range(cache, oblock, end, bio, - cell_prealloc, free_fn, free_context, cell_result); -} + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; -static int get_cell(struct cache *cache, - dm_oblock_t oblock, - struct prealloc *structs, - struct dm_bio_prison_cell **cell_result) -{ - int r; - struct dm_cell_key key; - struct dm_bio_prison_cell *cell_prealloc; + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ + if (!cell_prealloc) { + defer_bio(cache, bio); + return false; + } - cell_prealloc = prealloc_get_cell(structs); + build_key(oblock, end, &key); + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); + if (!r) { + /* + * Failed to get the lock. + */ + free_prison_cell(cache, cell_prealloc); + return r; + } - build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); - r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); - if (r) - prealloc_put_cell(structs, cell_prealloc); + if (cell != cell_prealloc) + free_prison_cell(cache, cell_prealloc); + + pb_size = get_per_bio_data_size(cache); + pb = get_per_bio_data(bio, pb_size); + pb->cell = cell; return r; } @@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b) return test_bit(from_cblock(b), cache->dirty_bitset); } -static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +static void set_dirty(struct cache *cache, dm_cblock_t cblock) { if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { atomic_inc(&cache->nr_dirty); - policy_set_dirty(cache->policy, oblock); + policy_set_dirty(cache->policy, cblock); } } -static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +/* + * These two are called when setting after migrations to force the policy + * and dirty bitset to be in sync. 
+ */ +static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, cblock); +} + +static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) { if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { - policy_clear_dirty(cache->policy, oblock); if (atomic_dec_return(&cache->nr_dirty) == 0) dm_table_event(cache->ti->table); } + + policy_clear_dirty(cache->policy, cblock); } /*----------------------------------------------------------------*/ @@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) oblocks_per_dblock(cache))); } -static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) -{ - return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); -} - static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; @@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) return r; } -/*----------------------------------------------------------------*/ - -static void load_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - dm_cache_metadata_get_stats(cache->cmd, &stats); - atomic_set(&cache->stats.read_hit, stats.read_hits); - atomic_set(&cache->stats.read_miss, stats.read_misses); - atomic_set(&cache->stats.write_hit, stats.write_hits); - atomic_set(&cache->stats.write_miss, stats.write_misses); -} - -static void save_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - if (get_cache_mode(cache) >= CM_READ_ONLY) - return; - - stats.read_hits = atomic_read(&cache->stats.read_hit); - stats.read_misses = atomic_read(&cache->stats.read_miss); - stats.write_hits = atomic_read(&cache->stats.write_hit); - stats.write_misses = atomic_read(&cache->stats.write_miss); - - dm_cache_metadata_set_stats(cache->cmd, &stats); -} - -/*---------------------------------------------------------------- - * Per bio data - *--------------------------------------------------------------*/ - -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - -static bool writethrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITETHROUGH; -} - -static bool writeback_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITEBACK; -} - -static bool passthrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_PASSTHROUGH; -} - -static size_t get_per_bio_data_size(struct cache *cache) -{ - return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; -} - -static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); - BUG_ON(!pb); - return pb; -} - -static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = get_per_bio_data(bio, data_size); - - pb->tick = false; - pb->req_nr = dm_bio_get_target_bio_nr(bio); - pb->all_io_entry = NULL; - pb->len = 0; - - return pb; -} - /*---------------------------------------------------------------- * Remapping *--------------------------------------------------------------*/ @@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) } static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) + dm_oblock_t oblock) { + // FIXME: this is called way too much. check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) @@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, check_if_tick_bio_needed(cache, bio); remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { - set_dirty(cache, oblock, cblock); + set_dirty(cache, cblock); clear_discard(cache, oblock_to_dblock(cache, oblock)); } } @@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -/* - * You must increment the deferred set whilst the prison cell is held. To - * encourage this, we ask for 'cell' to be passed in. - */ -static void inc_ds(struct cache *cache, struct bio *bio, - struct dm_bio_prison_cell *cell) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - BUG_ON(!cell); - BUG_ON(pb->all_io_entry); - - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); -} - static bool accountable_bio(struct cache *cache, struct bio *bio) { return ((bio->bi_bdev == cache->origin_dev->bdev) && @@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio) generic_make_request(bio); } -static void issue(struct cache *cache, struct bio *bio) +static void issue_op(struct bio *bio, void *context) { - unsigned long flags; - - if (!op_is_flush(bio->bi_opf)) { - accounted_request(cache, bio); - return; - } - - /* - * Batch together any bios that trigger commits and then issue a - * single commit for them in do_worker(). - */ - spin_lock_irqsave(&cache->lock, flags); - cache->commit_requested = true; - bio_list_add(&cache->deferred_flush_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) -{ - inc_ds(cache, bio, cell); - issue(cache, bio); + struct cache *cache = context; + accounted_request(cache, bio); } static void defer_writethrough_bio(struct cache *cache, struct bio *bio) @@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) bio_list_add(&cache->deferred_writethrough_bios, bio); spin_unlock_irqrestore(&cache->lock, flags); - wake_worker(cache); + wake_deferred_writethrough_worker(cache); } static void writethrough_endio(struct bio *bio) @@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio) } /* + * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks * to both the cache and origin devices. 
In future we'd like to clone the * bio and send them in parallel, but for now we're doing them in @@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r set_cache_mode(cache, CM_READ_ONLY); } +/*----------------------------------------------------------------*/ + +static void load_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + dm_cache_metadata_get_stats(cache->cmd, &stats); + atomic_set(&cache->stats.read_hit, stats.read_hits); + atomic_set(&cache->stats.read_miss, stats.read_misses); + atomic_set(&cache->stats.write_hit, stats.write_hits); + atomic_set(&cache->stats.write_miss, stats.write_misses); +} + +static void save_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return; + + stats.read_hits = atomic_read(&cache->stats.read_hit); + stats.read_misses = atomic_read(&cache->stats.read_miss); + stats.write_hits = atomic_read(&cache->stats.write_hit); + stats.write_misses = atomic_read(&cache->stats.write_miss); + + dm_cache_metadata_set_stats(cache->cmd, &stats); +} + +static void update_stats(struct cache_stats *stats, enum policy_operation op) +{ + switch (op) { + case POLICY_PROMOTE: + atomic_inc(&stats->promotion); + break; + + case POLICY_DEMOTE: + atomic_inc(&stats->demotion); + break; + + case POLICY_WRITEBACK: + atomic_inc(&stats->writeback); + break; + } +} + /*---------------------------------------------------------------- * Migration processing * * Migration covers moving data from the origin device to the cache, or * vice versa. *--------------------------------------------------------------*/ + static void inc_io_migrations(struct cache *cache) { atomic_inc(&cache->nr_io_migrations); @@ -1067,264 +1150,6 @@ static bool discard_or_flush(struct bio *bio) return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } -static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) -{ - if (discard_or_flush(cell->holder)) { - /* - * We have to handle these bios individually. - */ - dm_cell_release(cache->prison, cell, &cache->deferred_bios); - free_prison_cell(cache, cell); - } else - list_add_tail(&cell->user_list, &cache->deferred_cells); -} - -static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) -{ - unsigned long flags; - - if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { - /* - * There was no prisoner to promote to holder, the - * cell has been released. 
- */ - free_prison_cell(cache, cell); - return; - } - - spin_lock_irqsave(&cache->lock, flags); - __cell_defer(cache, cell); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) -{ - dm_cell_error(cache->prison, cell, err); - free_prison_cell(cache, cell); -} - -static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) -{ - cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); -} - -static void free_io_migration(struct dm_cache_migration *mg) -{ - struct cache *cache = mg->cache; - - dec_io_migrations(cache); - free_migration(mg); - wake_worker(cache); -} - -static void migration_failure(struct dm_cache_migration *mg) -{ - struct cache *cache = mg->cache; - const char *dev_name = cache_device_name(cache); - - if (mg->writeback) { - DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); - set_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - - } else if (mg->demote) { - DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); - policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); - - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - } else { - DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); - policy_remove_mapping(cache->policy, mg->new_oblock); - cell_defer(cache, mg->new_ocell, true); - } - - free_io_migration(mg); -} - -static void migration_success_pre_commit(struct dm_cache_migration *mg) -{ - int r; - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - clear_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - free_io_migration(mg); - return; - - } else if (mg->demote) { - r = dm_cache_remove_mapping(cache->cmd, mg->cblock); - if (r) { - DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - policy_force_mapping(cache->policy, mg->new_oblock, - mg->old_oblock); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - free_io_migration(mg); - return; - } - } else { - r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); - if (r) { - DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_insert_mapping", r); - policy_remove_mapping(cache->policy, mg->new_oblock); - free_io_migration(mg); - return; - } - } - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->need_commit_migrations); - cache->commit_requested = true; - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void migration_success_post_commit(struct dm_cache_migration *mg) -{ - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", - cache_device_name(cache)); - return; - - } else if (mg->demote) { - cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); - - if (mg->promote) { - mg->demote = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->quiesced_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - } else { - if (mg->invalidate) - policy_remove_mapping(cache->policy, mg->old_oblock); - free_io_migration(mg); - } - - } else { - if (mg->requeue_holder) { - clear_dirty(cache, mg->new_oblock, mg->cblock); - cell_defer(cache, mg->new_ocell, true); - } else { - /* - * The block was promoted via an overwrite, so it's dirty. - */ - set_dirty(cache, mg->new_oblock, mg->cblock); - bio_endio(mg->new_ocell->holder); - cell_defer(cache, mg->new_ocell, false); - } - free_io_migration(mg); - } -} - -static void copy_complete(int read_err, unsigned long write_err, void *context) -{ - unsigned long flags; - struct dm_cache_migration *mg = (struct dm_cache_migration *) context; - struct cache *cache = mg->cache; - - if (read_err || write_err) - mg->err = true; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void issue_copy(struct dm_cache_migration *mg) -{ - int r; - struct dm_io_region o_region, c_region; - struct cache *cache = mg->cache; - sector_t cblock = from_cblock(mg->cblock); - - o_region.bdev = cache->origin_dev->bdev; - o_region.count = cache->sectors_per_block; - - c_region.bdev = cache->cache_dev->bdev; - c_region.sector = cblock * cache->sectors_per_block; - c_region.count = cache->sectors_per_block; - - if (mg->writeback || mg->demote) { - /* demote */ - o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); - } else { - /* promote */ - o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); - } - - if (r < 0) { - DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); - migration_failure(mg); - } -} - -static void overwrite_endio(struct bio *bio) -{ - struct dm_cache_migration *mg = bio->bi_private; - struct cache *cache = mg->cache; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - unsigned long flags; - - dm_unhook_bio(&pb->hook_info, bio); - - if (bio->bi_error) - mg->err = true; - - mg->requeue_holder = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) -{ - size_t pb_data_size = get_per_bio_data_size(mg->cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); - remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); - - /* - * No need to inc_ds() here, since the cell will be held for the - * duration of the io. 
- */ - accounted_request(mg->cache, bio); -} - -static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) -{ - return (bio_data_dir(bio) == WRITE) && - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); -} - -static void avoid_copy(struct dm_cache_migration *mg) -{ - atomic_inc(&mg->cache->stats.copies_avoided); - migration_success_pre_commit(mg); -} - static void calc_discard_block_range(struct cache *cache, struct bio *bio, dm_dblock_t *b, dm_dblock_t *e) { @@ -1339,311 +1164,572 @@ static void calc_discard_block_range(struct cache *cache, struct bio *bio, *e = to_dblock(block_div(se, cache->discard_block_size)); } -static void issue_discard(struct dm_cache_migration *mg) +/*----------------------------------------------------------------*/ + +static void prevent_background_work(struct cache *cache) { - dm_dblock_t b, e; - struct bio *bio = mg->new_ocell->holder; - struct cache *cache = mg->cache; - - calc_discard_block_range(cache, bio, &b, &e); - while (b != e) { - set_discard(cache, b); - b = to_dblock(from_dblock(b) + 1); - } - - bio_endio(bio); - cell_defer(cache, mg->new_ocell, false); - free_migration(mg); - wake_worker(cache); + lockdep_off(); + down_write(&cache->background_work_lock); + lockdep_on(); } -static void issue_copy_or_discard(struct dm_cache_migration *mg) +static void allow_background_work(struct cache *cache) { - bool avoid; - struct cache *cache = mg->cache; - - if (mg->discard) { - issue_discard(mg); - return; - } - - if (mg->writeback || mg->demote) - avoid = !is_dirty(cache, mg->cblock) || - is_discarded_oblock(cache, mg->old_oblock); - else { - struct bio *bio = mg->new_ocell->holder; - - avoid = is_discarded_oblock(cache, mg->new_oblock); - - if (writeback_mode(&cache->features) && - !avoid && bio_writes_complete_block(cache, bio)) { - issue_overwrite(mg, bio); - return; - } - } - - avoid ? 
avoid_copy(mg) : issue_copy(mg); + lockdep_off(); + up_write(&cache->background_work_lock); + lockdep_on(); } -static void complete_migration(struct dm_cache_migration *mg) +static bool background_work_begin(struct cache *cache) { - if (mg->err) - migration_failure(mg); + bool r; + + lockdep_off(); + r = down_read_trylock(&cache->background_work_lock); + lockdep_on(); + + return r; +} + +static void background_work_end(struct cache *cache) +{ + lockdep_off(); + up_read(&cache->background_work_lock); + lockdep_on(); +} + +/*----------------------------------------------------------------*/ + +static void quiesce(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) +{ + init_continuation(&mg->k, continuation); + dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); +} + +static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) +{ + struct continuation *k = container_of(ws, struct continuation, ws); + return container_of(k, struct dm_cache_migration, k); +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); + + if (read_err || write_err) + mg->k.input = -EIO; + + queue_continuation(mg->cache->wq, &mg->k); +} + +static int copy(struct dm_cache_migration *mg, bool promote) +{ + int r; + struct dm_io_region o_region, c_region; + struct cache *cache = mg->cache; + + o_region.bdev = cache->origin_dev->bdev; + o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; + o_region.count = cache->sectors_per_block; + + c_region.bdev = cache->cache_dev->bdev; + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; + c_region.count = cache->sectors_per_block; + + if (promote) + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); else - migration_success_pre_commit(mg); + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); + + return r; } -static void process_migrations(struct cache *cache, struct list_head *head, - void (*fn)(struct dm_cache_migration *)) +static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) { - unsigned long flags; - struct list_head list; - struct dm_cache_migration *mg, *tmp; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - INIT_LIST_HEAD(&list); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(head, &list); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(mg, tmp, &list, list) - fn(mg); + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) + free_prison_cell(cache, pb->cell); + pb->cell = NULL; } -static void __queue_quiesced_migration(struct dm_cache_migration *mg) +static void overwrite_endio(struct bio *bio) { - list_add_tail(&mg->list, &mg->cache->quiesced_migrations); -} - -static void queue_quiesced_migration(struct dm_cache_migration *mg) -{ - unsigned long flags; + struct dm_cache_migration *mg = bio->bi_private; struct cache *cache = mg->cache; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - spin_lock_irqsave(&cache->lock, flags); - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + dm_unhook_bio(&pb->hook_info, bio); - wake_worker(cache); + if (bio->bi_error) + mg->k.input = bio->bi_error; + + queue_continuation(mg->cache->wq, &mg->k); } -static void 
queue_quiesced_migrations(struct cache *cache, struct list_head *work) +static void overwrite(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) { - unsigned long flags; - struct dm_cache_migration *mg, *tmp; + struct bio *bio = mg->overwrite_bio; + size_t pb_data_size = get_per_bio_data_size(mg->cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - spin_lock_irqsave(&cache->lock, flags); - list_for_each_entry_safe(mg, tmp, work, list) - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); - wake_worker(cache); -} + /* + * The overwrite bio is part of the copy operation, as such it does + * not set/clear discard or dirty flags. + */ + if (mg->op->op == POLICY_PROMOTE) + remap_to_cache(mg->cache, bio, mg->op->cblock); + else + remap_to_origin(mg->cache, bio); -static void check_for_quiesced_migrations(struct cache *cache, - struct per_bio_data *pb) -{ - struct list_head work; - - if (!pb->all_io_entry) - return; - - INIT_LIST_HEAD(&work); - dm_deferred_entry_dec(pb->all_io_entry, &work); - - if (!list_empty(&work)) - queue_quiesced_migrations(cache, &work); -} - -static void quiesce_migration(struct dm_cache_migration *mg) -{ - if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) - queue_quiesced_migration(mg); -} - -static void promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = false; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->new_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); -} - -static void writeback(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = true; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); -} - -static void demote_then_promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t old_oblock, dm_oblock_t new_oblock, - dm_cblock_t cblock, - struct dm_bio_prison_cell *old_ocell, - struct dm_bio_prison_cell *new_ocell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = old_oblock; - mg->new_oblock = new_oblock; - mg->cblock = cblock; - mg->old_ocell = old_ocell; - mg->new_ocell = new_ocell; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); + init_continuation(&mg->k, continuation); + accounted_request(mg->cache, bio); } /* - * Invalidate a cache entry. No writeback occurs; any changes in the cache - * block are thrown away. 
+ * Migration steps: + * + * 1) exclusive lock preventing WRITEs + * 2) quiesce + * 3) copy or issue overwrite bio + * 4) upgrade to exclusive lock preventing READs and WRITEs + * 5) quiesce + * 6) update metadata and commit + * 7) unlock */ -static void invalidate(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) +static void mg_complete(struct dm_cache_migration *mg, bool success) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct bio_list bios; + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + dm_cblock_t cblock = op->cblock; - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = true; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; + if (success) + update_stats(&cache->stats, op->op); - inc_io_migrations(cache); - quiesce_migration(mg); + switch (op->op) { + case POLICY_PROMOTE: + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); + policy_complete_background_work(cache->policy, op, success); + + if (mg->overwrite_bio) { + if (success) + force_set_dirty(cache, cblock); + else + mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + bio_endio(mg->overwrite_bio); + } else { + if (success) + force_clear_dirty(cache, cblock); + dec_io_migrations(cache); + } + break; + + case POLICY_DEMOTE: + /* + * We clear dirty here to update the nr_dirty counter. + */ + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + + case POLICY_WRITEBACK: + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + } + + bio_list_init(&bios); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } + + free_migration(mg); + defer_bios(cache, &bios); + wake_migration_worker(cache); + + background_work_end(cache); } -static void discard(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *cell) +static void mg_success(struct work_struct *ws) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct dm_cache_migration *mg = ws_to_mg(ws); + mg_complete(mg, mg->k.input == 0); +} + +static void mg_update_metadata(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + + switch (op->op) { + case POLICY_PROMOTE: + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't insert mapping", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); + + mg_complete(mg, false); + return; + } + mg_complete(mg, true); + break; + + case POLICY_DEMOTE: + r = dm_cache_remove_mapping(cache->cmd, op->cblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + + mg_complete(mg, false); + return; + } + + /* + * It would be nice if we only had to commit when a REQ_FLUSH + * comes through. 
But there's one scenario that we have to + * look out for: + * + * - vblock x in a cache block + * - domotion occurs + * - cache block gets reallocated and over written + * - crash + * + * When we recover, because there was no commit the cache will + * rollback to having the data for vblock x in the cache block. + * But the cache block has since been overwritten, so it'll end + * up pointing to data that was never in 'x' during the history + * of the device. + * + * To avoid this issue we require a commit as part of the + * demotion operation. + */ + init_continuation(&mg->k, mg_success); + continue_after_commit(&cache->committer, &mg->k); + schedule_commit(&cache->committer); + break; + + case POLICY_WRITEBACK: + mg_complete(mg, true); + break; + } +} + +static void mg_update_metadata_after_copy(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + else + mg_update_metadata(ws); +} + +static void mg_upgrade_lock(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); + + else { + /* + * Now we want the lock to prevent both reads and writes. + */ + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, + READ_WRITE_LOCK_LEVEL); + if (r < 0) + mg_complete(mg, false); + + else if (r) + quiesce(mg, mg_update_metadata); + + else + mg_update_metadata(ws); + } +} + +static void mg_copy(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + + if (mg->overwrite_bio) { + /* + * It's safe to do this here, even though it's new data + * because all IO has been locked out of the block. + * + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL + * so _not_ using mg_upgrade_lock() as continutation. + */ + overwrite(mg, mg_update_metadata_after_copy); + + } else { + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); + + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } + + init_continuation(&mg->k, mg_upgrade_lock); + + r = copy(mg, is_policy_promote); + if (r) { + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); + mg->k.input = -EIO; + mg_complete(mg, false); + } + } +} + +static int mg_lock_writes(struct dm_cache_migration *mg) +{ + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; + + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); + mg_complete(mg, false); + return -ENOMEM; + } + + /* + * Prevent writes to the block, but allow reads to continue. + * Unless we're using an overwrite bio, in which case we lock + * everything. + */ + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + mg->overwrite_bio ? 
READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, + prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + mg_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r == 0) + mg_copy(&mg->k.ws); + else + quiesce(mg, mg_copy); + + return 0; +} + +static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) +{ + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) { + policy_complete_background_work(cache->policy, op, false); + return -EPERM; + } + + mg = alloc_migration(cache); + if (!mg) { + policy_complete_background_work(cache->policy, op, false); + background_work_end(cache); + return -ENOMEM; + } + + memset(mg, 0, sizeof(*mg)); - mg->err = false; - mg->discard = true; - mg->writeback = false; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = false; - mg->invalidate = false; mg->cache = cache; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; + mg->op = op; + mg->overwrite_bio = bio; - quiesce_migration(mg); + if (!bio) + inc_io_migrations(cache); + + return mg_lock_writes(mg); +} + +/*---------------------------------------------------------------- + * invalidation processing + *--------------------------------------------------------------*/ + +static void invalidate_complete(struct dm_cache_migration *mg, bool success) +{ + struct bio_list bios; + struct cache *cache = mg->cache; + + bio_list_init(&bios); + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + + if (!success && mg->overwrite_bio) + bio_io_error(mg->overwrite_bio); + + free_migration(mg); + defer_bios(cache, &bios); + + background_work_end(cache); +} + +static void invalidate_completed(struct work_struct *ws) +{ + struct dm_cache_migration *mg = ws_to_mg(ws); + invalidate_complete(mg, !mg->k.input); +} + +static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) +{ + int r = policy_invalidate_mapping(cache->policy, cblock); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, cblock); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + } + + } else if (r == -ENODATA) { + /* + * Harmless, already unmapped. 
+ */ + r = 0; + + } else + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); + + return r; +} + +static void invalidate_remove(struct work_struct *ws) +{ + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; + + r = invalidate_cblock(cache, mg->invalidate_cblock); + if (r) { + invalidate_complete(mg, false); + return; + } + + init_continuation(&mg->k, invalidate_completed); + continue_after_commit(&cache->committer, &mg->k); + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); + mg->overwrite_bio = NULL; + schedule_commit(&cache->committer); +} + +static int invalidate_lock(struct dm_cache_migration *mg) +{ + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; + + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + invalidate_complete(mg, false); + return -ENOMEM; + } + + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + invalidate_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r) + quiesce(mg, invalidate_remove); + + else { + /* + * We can't call invalidate_remove() directly here because we + * might still be in request context. + */ + init_continuation(&mg->k, invalidate_remove); + queue_work(cache->wq, &mg->k.ws); + } + + return 0; +} + +static int invalidate_start(struct cache *cache, dm_cblock_t cblock, + dm_oblock_t oblock, struct bio *bio) +{ + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) + return -EPERM; + + mg = alloc_migration(cache); + if (!mg) { + background_work_end(cache); + return -ENOMEM; + } + + memset(mg, 0, sizeof(*mg)); + + mg->cache = cache; + mg->overwrite_bio = bio; + mg->invalidate_cblock = cblock; + mg->invalidate_oblock = oblock; + + return invalidate_lock(mg); } /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ -static void defer_bio(struct cache *cache, struct bio *bio) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); -} - -static void process_flush_bio(struct cache *cache, struct bio *bio) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - BUG_ON(bio->bi_iter.bi_size); - if (!pb->req_nr) - remap_to_origin(cache, bio); - else - remap_to_cache(cache, bio, 0); - - /* - * REQ_PREFLUSH is not directed at any particular block so we don't - * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH - * by dm-core. 
- */ - issue(cache, bio); -} - -static void process_discard_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) -{ - int r; - dm_dblock_t b, e; - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - - calc_discard_block_range(cache, bio, &b, &e); - if (b == e) { - bio_endio(bio); - return; - } - - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; - - discard(cache, structs, new_ocell); -} - -static bool spare_migration_bandwidth(struct cache *cache) + +enum busy { + IDLE, + MODERATE, + BUSY +}; + +static enum busy spare_migration_bandwidth(struct cache *cache) { + bool idle = iot_idle_for(&cache->origin_tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; - return current_volume < cache->migration_threshold; + + if (current_volume <= cache->migration_threshold) + return idle ? IDLE : MODERATE; + else + return idle ? MODERATE : BUSY; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) /*----------------------------------------------------------------*/ -struct inc_detail { - struct cache *cache; - struct bio_list bios_for_issue; - struct bio_list unhandled_bios; - bool any_writes; -}; - -static void inc_fn(void *context, struct dm_bio_prison_cell *cell) +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) { - struct bio *bio; - struct inc_detail *detail = context; - struct cache *cache = detail->cache; + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} - inc_ds(cache, cell->holder, cell); - if (bio_data_dir(cell->holder) == WRITE) - detail->any_writes = true; +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) +{ + return writeback_mode(&cache->features) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); +} - while ((bio = bio_list_pop(&cell->bios))) { - if (discard_or_flush(bio)) { - bio_list_add(&detail->unhandled_bios, bio); - continue; +static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, + bool *commit_needed) +{ + int r, data_dir; + bool rb, background_queued; + dm_cblock_t cblock; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + *commit_needed = false; + + rb = bio_detain_shared(cache, block, bio); + if (!rb) { + /* + * An exclusive lock is held for this block, so we have to + * wait. We set the commit_needed flag so the current + * transaction will be committed asap, allowing this lock + * to be dropped. 
+ */ + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } + + data_dir = bio_data_dir(bio); + + if (optimisable_bio(cache, bio, block)) { + struct policy_work *op = NULL; + + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; } - if (bio_data_dir(bio) == WRITE) - detail->any_writes = true; - - bio_list_add(&detail->bios_for_issue, bio); - inc_ds(cache, bio, cell); - } -} - -// FIXME: refactor these two -static void remap_cell_to_origin_clear_discard(struct cache *cache, - struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, bool issue_holder) -{ - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_origin(cache, cell->holder); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); - - if (detail.any_writes) - clear_discard(cache, oblock_to_dblock(cache, oblock)); - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_origin(cache, bio); - issue(cache, bio); - } - - free_prison_cell(cache, cell); -} - -static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) -{ - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_cache(cache, cell->holder, cblock); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); - - if (detail.any_writes) { - set_dirty(cache, oblock, cblock); - clear_discard(cache, oblock_to_dblock(cache, oblock)); - } - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_cache(cache, bio, cblock); - issue(cache, bio); - } - - free_prison_cell(cache, cell); -} - -/*----------------------------------------------------------------*/ - -struct old_oblock_lock { - struct policy_locker locker; - struct cache *cache; - struct prealloc *structs; - struct dm_bio_prison_cell *cell; -}; - -static int null_locker(struct policy_locker *locker, dm_oblock_t b) -{ - /* This should never be called */ - BUG(); - return 0; -} - -static int cell_locker(struct policy_locker *locker, dm_oblock_t b) -{ - struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); - struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); - - return bio_detain(l->cache, b, NULL, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - l->structs, &l->cell); -} - -static void process_cell(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *new_ocell) -{ - int r; - bool release_cell = true; - struct bio *bio = new_ocell->holder; - dm_oblock_t block = get_bio_block(cache, bio); - struct 
policy_result lookup_result; - bool passthrough = passthrough_mode(&cache->features); - bool fast_promotion, can_migrate; - struct old_oblock_lock ool; - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); - - ool.locker.fn = cell_locker; - ool.cache = cache; - ool.structs = structs; - ool.cell = NULL; - r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - - if (r == -EWOULDBLOCK) - /* migration has been denied */ - lookup_result.op = POLICY_MISS; - - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough) { - inc_miss_counter(cache, bio); - - /* - * Passthrough always maps to the origin, - * invalidating any cache blocks that are written - * to. - */ - - if (bio_data_dir(bio) == WRITE) { - atomic_inc(&cache->stats.demotion); - invalidate(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - - } else { - /* FIXME: factor out issue_origin() */ - remap_to_origin_clear_discard(cache, bio, block); - inc_and_issue(cache, bio, new_ocell); - } - } else { - inc_hit_counter(cache, bio); - - if (bio_data_dir(bio) == WRITE && - writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - inc_and_issue(cache, bio, new_ocell); - - } else { - remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); - release_cell = false; - } + if (r == -ENOENT && op) { + bio_drop_shared_lock(cache, bio); + BUG_ON(op->op != POLICY_PROMOTE); + mg_start(cache, op, bio); + return DM_MAPIO_SUBMITTED; + } + } else { + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; } - break; + if (background_queued) + wake_migration_worker(cache); + } - case POLICY_MISS: + if (r == -ENOENT) { + /* + * Miss. + */ inc_miss_counter(cache, bio); - remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); - release_cell = false; - break; + if (pb->req_nr == 0) { + accounted_begin(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); - case POLICY_NEW: - atomic_inc(&cache->stats.promotion); - promote(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - break; + } else { + /* + * This is a duplicate writethrough io that is no + * longer needed because the block has been demoted. + */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + } else { + /* + * Hit. + */ + inc_hit_counter(cache, bio); - case POLICY_REPLACE: - atomic_inc(&cache->stats.demotion); - atomic_inc(&cache->stats.promotion); - demote_then_promote(cache, structs, lookup_result.old_oblock, - block, lookup_result.cblock, - ool.cell, new_ocell); - release_cell = false; - break; + /* + * Passthrough always maps to the origin, invalidating any + * cache blocks that are written to. 
+ */ + if (passthrough_mode(&cache->features)) { + if (bio_data_dir(bio) == WRITE) { + bio_drop_shared_lock(cache, bio); + atomic_inc(&cache->stats.demotion); + invalidate_start(cache, cblock, block, bio); + } else + remap_to_origin_clear_discard(cache, bio, block); - default: - DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - bio_io_error(bio); + } else { + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, cblock)) { + remap_to_origin_then_cache(cache, bio, block, cblock); + accounted_begin(cache, bio); + } else + remap_to_cache_dirty(cache, bio, block, cblock); + } } - if (release_cell) - cell_defer(cache, new_ocell, false); -} - -static void process_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) -{ - int r; - dm_oblock_t block = get_bio_block(cache, bio); - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - /* - * Check to see if that block is currently migrating. + * dm core turns FUA requests into a separate payload and FLUSH req. */ - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain(cache, block, bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; + if (bio->bi_opf & REQ_FUA) { + /* + * issue_after_commit will call accounted_begin a second time. So + * we call accounted_complete() to avoid double accounting. + */ + accounted_complete(cache, bio); + issue_after_commit(&cache->committer, bio); + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } - process_cell(cache, structs, new_ocell); + return DM_MAPIO_REMAPPED; } -static int need_commit_due_to_time(struct cache *cache) +static bool process_bio(struct cache *cache, struct bio *bio) { - return jiffies < cache->last_commit_jiffies || - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; + bool commit_needed; + + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) + generic_make_request(bio); + + return commit_needed; } /* @@ -1929,29 +1903,62 @@ static int commit(struct cache *cache, bool clean_shutdown) return r; } -static int commit_if_needed(struct cache *cache) +/* + * Used by the batcher. + */ +static int commit_op(void *context) { - int r = 0; + struct cache *cache = context; - if ((cache->commit_requested || need_commit_due_to_time(cache)) && - dm_cache_changed_this_transaction(cache->cmd)) { - r = commit(cache, false); - cache->commit_requested = false; - cache->last_commit_jiffies = jiffies; - } + if (dm_cache_changed_this_transaction(cache->cmd)) + return commit(cache, false); - return r; + return 0; } -static void process_deferred_bios(struct cache *cache) +/*----------------------------------------------------------------*/ + +static bool process_flush_bio(struct cache *cache, struct bio *bio) { - bool prealloc_used = false; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + if (!pb->req_nr) + remap_to_origin(cache, bio); + else + remap_to_cache(cache, bio, 0); + + issue_after_commit(&cache->committer, bio); + return true; +} + +static bool process_discard_bio(struct cache *cache, struct bio *bio) +{ + dm_dblock_t b, e; + + // FIXME: do we need to lock the region? Or can we just assume the + // user wont be so foolish as to issue discard concurrently with + // other IO? 
+ calc_discard_block_range(cache, bio, &b, &e); + while (b != e) { + set_discard(cache, b); + b = to_dblock(from_dblock(b) + 1); + } + + bio_endio(bio); + + return false; +} + +static void process_deferred_bios(struct work_struct *ws) +{ + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); + unsigned long flags; + bool commit_needed = false; struct bio_list bios; struct bio *bio; - struct prealloc structs; - memset(&structs, 0, sizeof(structs)); bio_list_init(&bios); spin_lock_irqsave(&cache->lock, flags); @@ -1959,93 +1966,25 @@ static void process_deferred_bios(struct cache *cache) bio_list_init(&cache->deferred_bios); spin_unlock_irqrestore(&cache->lock, flags); - while (!bio_list_empty(&bios)) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&cache->deferred_bios, &bios); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } - - bio = bio_list_pop(&bios); - + while ((bio = bio_list_pop(&bios))) { if (bio->bi_opf & REQ_PREFLUSH) - process_flush_bio(cache, bio); + commit_needed = process_flush_bio(cache, bio) || commit_needed; + else if (bio_op(bio) == REQ_OP_DISCARD) - process_discard_bio(cache, &structs, bio); + commit_needed = process_discard_bio(cache, bio) || commit_needed; + else - process_bio(cache, &structs, bio); + commit_needed = process_bio(cache, bio) || commit_needed; } - if (prealloc_used) - prealloc_free_structs(cache, &structs); + if (commit_needed) + schedule_commit(&cache->committer); } -static void process_deferred_cells(struct cache *cache) +static void process_deferred_writethrough_bios(struct work_struct *ws) { - bool prealloc_used = false; - unsigned long flags; - struct dm_bio_prison_cell *cell, *tmp; - struct list_head cells; - struct prealloc structs; + struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); - memset(&structs, 0, sizeof(structs)); - - INIT_LIST_HEAD(&cells); - - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - list_splice(&cells, &cache->deferred_cells); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } - - process_cell(cache, &structs, cell); - } - - if (prealloc_used) - prealloc_free_structs(cache, &structs); -} - -static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) -{ - unsigned long flags; - struct bio_list bios; - struct bio *bio; - - bio_list_init(&bios); - - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_flush_bios); - bio_list_init(&cache->deferred_flush_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - /* - * These bios have already been through inc_ds() - */ - while ((bio = bio_list_pop(&bios))) - submit_bios ? 
accounted_request(cache, bio) : bio_io_error(bio); -} - -static void process_deferred_writethrough_bios(struct cache *cache) -{ unsigned long flags; struct bio_list bios; struct bio *bio; @@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache) spin_unlock_irqrestore(&cache->lock, flags); /* - * These bios have already been through inc_ds() + * These bios have already been through accounted_begin() */ while ((bio = bio_list_pop(&bios))) - accounted_request(cache, bio); -} - -static void writeback_some_dirty_blocks(struct cache *cache) -{ - bool prealloc_used = false; - dm_oblock_t oblock; - dm_cblock_t cblock; - struct prealloc structs; - struct dm_bio_prison_cell *old_ocell; - bool busy = !iot_idle_for(&cache->origin_tracker, HZ); - - memset(&structs, 0, sizeof(structs)); - - while (spare_migration_bandwidth(cache)) { - if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) - break; /* no work to do */ - - prealloc_used = true; - if (prealloc_data_structs(cache, &structs) || - get_cell(cache, oblock, &structs, &old_ocell)) { - policy_set_dirty(cache->policy, oblock); - break; - } - - writeback(cache, &structs, oblock, cblock, old_ocell); - } - - if (prealloc_used) - prealloc_free_structs(cache, &structs); -} - -/*---------------------------------------------------------------- - * Invalidations. - * Dropping something from the cache *without* writing back. - *--------------------------------------------------------------*/ - -static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) -{ - int r = 0; - uint64_t begin = from_cblock(req->cblocks->begin); - uint64_t end = from_cblock(req->cblocks->end); - - while (begin != end) { - r = policy_remove_cblock(cache->policy, to_cblock(begin)); - if (!r) { - r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); - if (r) { - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - break; - } - - } else if (r == -ENODATA) { - /* harmless, already unmapped */ - r = 0; - - } else { - DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); - break; - } - - begin++; - } - - cache->commit_requested = true; - - req->err = r; - atomic_set(&req->complete, 1); - - wake_up(&req->result_wait); -} - -static void process_invalidation_requests(struct cache *cache) -{ - struct list_head list; - struct invalidation_request *req, *tmp; - - INIT_LIST_HEAD(&list); - spin_lock(&cache->invalidation_lock); - list_splice_init(&cache->invalidation_requests, &list); - spin_unlock(&cache->invalidation_lock); - - list_for_each_entry_safe (req, tmp, &list, list) - process_invalidation_request(cache, req); + generic_make_request(bio); } /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ -static bool is_quiescing(struct cache *cache) -{ - return atomic_read(&cache->quiescing); -} - -static void ack_quiescing(struct cache *cache) -{ - if (is_quiescing(cache)) { - atomic_inc(&cache->quiescing_ack); - wake_up(&cache->quiescing_wait); - } -} - -static void wait_for_quiescing_ack(struct cache *cache) -{ - wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); -} - -static void start_quiescing(struct cache *cache) -{ - atomic_inc(&cache->quiescing); - wait_for_quiescing_ack(cache); -} - -static void stop_quiescing(struct cache *cache) -{ - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); -} - -static void wait_for_migrations(struct cache 
*cache) -{ - wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); -} - -static void stop_worker(struct cache *cache) -{ - cancel_delayed_work(&cache->waker); - flush_workqueue(cache->wq); -} - -static void requeue_deferred_cells(struct cache *cache) -{ - unsigned long flags; - struct list_head cells; - struct dm_bio_prison_cell *cell, *tmp; - - INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) - cell_requeue(cache, cell); -} static void requeue_deferred_bios(struct cache *cache) { @@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache) } } -static int more_work(struct cache *cache) -{ - if (is_quiescing(cache)) - return !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations); - else - return !bio_list_empty(&cache->deferred_bios) || - !list_empty(&cache->deferred_cells) || - !bio_list_empty(&cache->deferred_flush_bios) || - !bio_list_empty(&cache->deferred_writethrough_bios) || - !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations) || - cache->invalidate; -} - -static void do_worker(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, worker); - - do { - if (!is_quiescing(cache)) { - writeback_some_dirty_blocks(cache); - process_deferred_writethrough_bios(cache); - process_deferred_bios(cache); - process_deferred_cells(cache); - process_invalidation_requests(cache); - } - - process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); - process_migrations(cache, &cache->completed_migrations, complete_migration); - - if (commit_if_needed(cache)) { - process_deferred_flush_bios(cache, false); - process_migrations(cache, &cache->need_commit_migrations, migration_failure); - } else { - process_deferred_flush_bios(cache, true); - process_migrations(cache, &cache->need_commit_migrations, - migration_success_post_commit); - } - - ack_quiescing(cache); - - } while (more_work(cache)); -} - /* * We want to commit periodically so that not too much * unwritten metadata builds up. 
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws) static void do_waker(struct work_struct *ws) { struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy, true); - wake_worker(cache); + wake_migration_worker(cache); + schedule_commit(&cache->committer); queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); } -/*----------------------------------------------------------------*/ - -static int is_congested(struct dm_dev *dev, int bdi_bits) +static void check_migrations(struct work_struct *ws) { - struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} + int r; + struct policy_work *op; + struct cache *cache = container_of(ws, struct cache, migration_worker); + enum busy b; -static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct cache *cache = container_of(cb, struct cache, callbacks); + for (;;) { + b = spare_migration_bandwidth(cache); + if (b == BUSY) + break; - return is_congested(cache->origin_dev, bdi_bits) || - is_congested(cache->cache_dev, bdi_bits); + r = policy_get_background_work(cache->policy, b == IDLE, &op); + if (r == -ENODATA) + break; + + if (r) { + DMERR_LIMIT("%s: policy_background_work failed", + cache_device_name(cache)); + break; + } + + r = mg_start(cache, op, NULL); + if (r) + break; + } } /*---------------------------------------------------------------- @@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache) mempool_destroy(cache->migration_pool); - if (cache->all_io_ds) - dm_deferred_set_destroy(cache->all_io_ds); - if (cache->prison) - dm_bio_prison_destroy(cache->prison); + dm_bio_prison_destroy_v2(cache->prison); if (cache->wq) destroy_workqueue(cache->wq); @@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return PTR_ERR(p); } cache->policy = p; + BUG_ON(!cache->policy); return 0; } @@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size) cache->cache_size = size; } +static int is_congested(struct dm_dev *dev, int bdi_bits) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + return bdi_congested(q->backing_dev_info, bdi_bits); +} + +static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct cache *cache = container_of(cb, struct cache, callbacks); + + return is_congested(cache->origin_dev, bdi_bits) || + is_congested(cache->cache_dev, bdi_bits); +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2788,7 +2568,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; - /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); @@ -2854,24 +2633,18 @@ static int cache_create(struct cache_args *ca, struct cache **result) r = -EINVAL; goto bad; } + + policy_allow_migrations(cache->policy, false); } spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->deferred_cells); bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_flush_bios); bio_list_init(&cache->deferred_writethrough_bios); - INIT_LIST_HEAD(&cache->quiesced_migrations); - INIT_LIST_HEAD(&cache->completed_migrations); - INIT_LIST_HEAD(&cache->need_commit_migrations); atomic_set(&cache->nr_allocated_migrations, 0); 
atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); - init_waitqueue_head(&cache->quiescing_wait); - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); - r = -ENOMEM; atomic_set(&cache->nr_dirty, 0); cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); @@ -2900,27 +2673,23 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!cache->wq) { *error = "could not create workqueue for metadata object"; goto bad; } - INIT_WORK(&cache->worker, do_worker); + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); + INIT_WORK(&cache->deferred_writethrough_worker, + process_deferred_writethrough_bios); + INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); - cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(); + cache->prison = dm_bio_prison_create_v2(cache->wq); if (!cache->prison) { *error = "could not create bio prison"; goto bad; } - cache->all_io_ds = dm_deferred_set_create(); - if (!cache->all_io_ds) { - *error = "could not create all_io deferred set"; - goto bad; - } - cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, migration_cache); if (!cache->migration_pool) { @@ -2947,11 +2716,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->invalidation_lock); INIT_LIST_HEAD(&cache->invalidation_requests); + batcher_init(&cache->committer, commit_op, cache, + issue_op, cache, cache->wq); iot_init(&cache->origin_tracker); + init_rwsem(&cache->background_work_lock); + prevent_background_work(cache); + *result = cache; return 0; - bad: destroy(cache); return r; @@ -3009,7 +2782,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = cache; - out: destroy_cache_args(ca); return r; @@ -3022,17 +2794,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) struct cache *cache = ti->private; int r; - struct dm_bio_prison_cell *cell = NULL; + bool commit_needed; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); - bool can_migrate = false; - bool fast_promotion; - struct policy_result lookup_result; - struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); - struct old_oblock_lock ool; - - ool.locker.fn = null_locker; + init_per_bio_data(bio, pb_data_size); if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { /* * This can only occur if the io goes to a partial block at @@ -3049,101 +2815,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - /* - * Check to see if that block is currently migrating. 
- */ - cell = alloc_prison_cell(cache); - if (!cell) { - defer_bio(cache, bio); - return DM_MAPIO_SUBMITTED; - } - - r = bio_detain(cache, block, bio, cell, - (cell_free_fn) free_prison_cell, - cache, &cell); - if (r) { - if (r < 0) - defer_bio(cache, bio); - - return DM_MAPIO_SUBMITTED; - } - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - - r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); - return DM_MAPIO_SUBMITTED; - - } else if (r) { - DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", - cache_device_name(cache), r); - cell_defer(cache, cell, false); - bio_io_error(bio); - return DM_MAPIO_SUBMITTED; - } - - r = DM_MAPIO_REMAPPED; - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough_mode(&cache->features)) { - if (bio_data_dir(bio) == WRITE) { - /* - * We need to invalidate this block, so - * defer for the worker thread. - */ - cell_defer(cache, cell, true); - r = DM_MAPIO_SUBMITTED; - - } else { - inc_miss_counter(cache, bio); - remap_to_origin_clear_discard(cache, bio, block); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - // FIXME: we want to remap hits or misses straight - // away rather than passing over to the worker. - cell_defer(cache, cell, false); - } - - } else { - inc_hit_counter(cache, bio); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - cell_defer(cache, cell, false); - - } else - remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); - } - break; - - case POLICY_MISS: - inc_miss_counter(cache, bio); - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio); - // FIXME: remap everything as a miss - cell_defer(cache, cell, false); - r = DM_MAPIO_SUBMITTED; - - } else - remap_cell_to_origin_clear_discard(cache, cell, block, false); - break; - - default: - DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - cell_defer(cache, cell, false); - bio_io_error(bio); - r = DM_MAPIO_SUBMITTED; - } + r = map_bio(cache, bio, block, &commit_needed); + if (commit_needed) + schedule_commit(&cache->committer); return r; } @@ -3163,7 +2837,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) spin_unlock_irqrestore(&cache->lock, flags); } - check_for_quiesced_migrations(cache, pb); + bio_drop_shared_lock(cache, bio); accounted_complete(cache, bio); return 0; @@ -3263,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti) { struct cache *cache = ti->private; - start_quiescing(cache); - wait_for_migrations(cache); - stop_worker(cache); + prevent_background_work(cache); + BUG_ON(atomic_read(&cache->nr_io_migrations)); + + cancel_delayed_work(&cache->waker); + flush_workqueue(cache->wq); + WARN_ON(cache->origin_tracker.in_flight); + + /* + * If it's a flush suspend there won't be any deferred bios, so this + * call is harmless. 
+ */ requeue_deferred_bios(cache); - requeue_deferred_cells(cache); - stop_quiescing(cache); if (get_cache_mode(cache) == CM_WRITE) (void) sync_metadata(cache); @@ -3280,15 +2960,10 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, int r; struct cache *cache = context; - r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); if (r) return r; - if (dirty) - set_dirty(cache, oblock, cblock); - else - clear_dirty(cache, oblock, cblock); - return 0; } @@ -3487,6 +3162,7 @@ static void cache_resume(struct dm_target *ti) struct cache *cache = ti->private; cache->need_tick_bio = true; + allow_background_work(cache); do_waker(&cache->waker.work); } @@ -3620,11 +3296,20 @@ err: DMEMIT("Error"); } +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + /* * A cache block range can take two forms: * * i) A single cblock, eg. '3456' - * ii) A begin and end cblock with dots between, eg. 123-234 + * ii) A begin and end cblock with a dash between, eg. 123-234 */ static int parse_cblock_range(struct cache *cache, const char *str, struct cblock_range *result) @@ -3690,23 +3375,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range return 0; } +static inline dm_cblock_t cblock_succ(dm_cblock_t b) +{ + return to_cblock(from_cblock(b) + 1); +} + static int request_invalidation(struct cache *cache, struct cblock_range *range) { - struct invalidation_request req; + int r = 0; - INIT_LIST_HEAD(&req.list); - req.cblocks = range; - atomic_set(&req.complete, 0); - req.err = 0; - init_waitqueue_head(&req.result_wait); + /* + * We don't need to do any locking here because we know we're in + * passthrough mode. There's is potential for a race between an + * invalidation triggered by an io and an invalidation message. This + * is harmless, we must not worry if the policy call fails. + */ + while (range->begin != range->end) { + r = invalidate_cblock(cache, range->begin); + if (r) + return r; - spin_lock(&cache->invalidation_lock); - list_add(&req.list, &cache->invalidation_requests); - spin_unlock(&cache->invalidation_lock); - wake_worker(cache); + range->begin = cblock_succ(range->begin); + } - wait_event(req.result_wait, atomic_read(&req.complete)); - return req.err; + cache->commit_requested = true; + return r; } static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, @@ -3816,7 +3509,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 10, 0}, + .version = {2, 0, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 9b4b5a797cf8a8d904df979891a8de53f2cb9694 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 4 Jan 2017 20:23:51 +0100 Subject: [PATCH 03/50] dm table: add flag to allow target to handle its own integrity metadata Add DM_TARGET_INTEGRITY flag that specifies bio integrity metadata is not inherited but implemented in the target itself. 
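A target opts in by setting the flag in its feature mask; dm_table_add_target() then records integrity_added, so the table neither registers nor verifies an inherited integrity profile for that device. As an illustrative sketch only (the target name and the ctr/dtr/map hooks below are placeholders, not part of this patch), a target that produces and checks its own integrity metadata would declare:

    static struct target_type example_integrity_target = {
    	.name     = "example-integrity",	/* placeholder name */
    	.version  = {1, 0, 0},
    	.features = DM_TARGET_INTEGRITY,	/* integrity metadata handled by the target itself */
    	.module   = THIS_MODULE,
    	.ctr      = example_integrity_ctr,	/* placeholder hooks */
    	.dtr      = example_integrity_dtr,
    	.map      = example_integrity_map,
    };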
Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 11 +++++++++++ include/linux/device-mapper.h | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ad16d9c9d5a..b0600840e734 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -47,6 +47,7 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; bool all_blk_mq:1; + unsigned integrity_added:1; /* * Indicates the rw permissions for the new logical @@ -725,6 +726,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = tgt->type; } + if (dm_target_has_integrity(tgt->type)) + t->integrity_added = 1; + tgt->table = t; tgt->begin = start; tgt->len = len; @@ -1168,6 +1172,10 @@ static int dm_table_register_integrity(struct dm_table *t) struct mapped_device *md = t->md; struct gendisk *template_disk = NULL; + /* If target handles integrity itself do not register it here. */ + if (t->integrity_added) + return 0; + template_disk = dm_table_get_integrity_disk(t); if (!template_disk) return 0; @@ -1394,6 +1402,9 @@ static void dm_table_verify_integrity(struct dm_table *t) { struct gendisk *template_disk = NULL; + if (t->integrity_added) + return; + if (t->integrity_supported) { /* * Verify that the original integrity profile diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fd..874462153f14 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -221,6 +221,12 @@ struct target_type { */ typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); +/* + * A target implements own bio data integrity. + */ +#define DM_TARGET_INTEGRITY 0x00000010 +#define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) + struct dm_target { struct dm_table *table; struct target_type *type; From 400a0befc96240f7bb2a53b9622deffd55d385fe Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 4 Jan 2017 20:23:52 +0100 Subject: [PATCH 04/50] dm bufio: add sector start offset to dm-bufio interface Introduce dm_bufio_set_sector_offset() interface to allow setting a sector offset for a dm-bufio client. This is a prereq for the DM integrity target. 
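A hedged sketch of the intended call order (the bufio client "c" and the offset "start" are placeholders assumed to be set up elsewhere; only the dm-bufio calls themselves are from the real interface). The offset must be set while no I/O is in progress on the client, e.g. right after creating it:

	struct dm_buffer *b;
	void *data;

	/* shift all buffer I/O by "start" sectors on the underlying device */
	dm_bufio_set_sector_offset(c, start);

	/* block 0 of the client now begins at sector "start" */
	data = dm_bufio_read(c, 0, &b);
	if (!IS_ERR_OR_NULL(data)) {
		/* ... use the buffered data ... */
		dm_bufio_release(b);
	}

This lets a client keep addressing its buffers from block 0 while the buffered area starts at an arbitrary sector offset into the underlying device.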
Signed-off-by: Mikulas Patocka Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 51 +++++++++++++++++++++++++++---------------- drivers/md/dm-bufio.h | 7 ++++++ 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index df4859f6ac6a..280b56c229d6 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -110,6 +110,8 @@ struct dm_bufio_client { struct rb_root buffer_tree; wait_queue_head_t free_buffer_wait; + sector_t start; + int async_write_error; struct list_head client_list; @@ -557,8 +559,8 @@ static void dmio_complete(unsigned long error, void *context) b->bio.bi_end_io(&b->bio); } -static void use_dmio(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) +static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, + unsigned n_sectors, bio_end_io_t *end_io) { int r; struct dm_io_request io_req = { @@ -570,8 +572,8 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, }; struct dm_io_region region = { .bdev = b->c->bdev, - .sector = block << b->c->sectors_per_block_bits, - .count = b->c->block_size >> SECTOR_SHIFT, + .sector = sector, + .count = n_sectors, }; if (b->data_mode != DATA_MODE_VMALLOC) { @@ -606,14 +608,14 @@ static void inline_endio(struct bio *bio) end_fn(bio); } -static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) +static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, + unsigned n_sectors, bio_end_io_t *end_io) { char *ptr; int len; bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); - b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_iter.bi_sector = sector; b->bio.bi_bdev = b->c->bdev; b->bio.bi_end_io = inline_endio; /* @@ -628,7 +630,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, * If len < PAGE_SIZE the buffer doesn't cross page boundary. */ ptr = b->data; - len = b->c->block_size; + len = n_sectors << SECTOR_SHIFT; if (len >= PAGE_SIZE) BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); @@ -640,7 +642,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, len < PAGE_SIZE ? 
len : PAGE_SIZE, offset_in_page(ptr))) { BUG_ON(b->c->block_size <= PAGE_SIZE); - use_dmio(b, rw, block, end_io); + use_dmio(b, rw, sector, n_sectors, end_io); return; } @@ -651,17 +653,22 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, submit_bio(&b->bio); } -static void submit_io(struct dm_buffer *b, int rw, sector_t block, - bio_end_io_t *end_io) +static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) { + unsigned n_sectors; + sector_t sector; + if (rw == WRITE && b->c->write_callback) b->c->write_callback(b); - if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && + sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; + n_sectors = 1 << b->c->sectors_per_block_bits; + + if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) && b->data_mode != DATA_MODE_VMALLOC) - use_inline_bio(b, rw, block, end_io); + use_inline_bio(b, rw, sector, n_sectors, end_io); else - use_dmio(b, rw, block, end_io); + use_dmio(b, rw, sector, n_sectors, end_io); } /*---------------------------------------------------------------- @@ -713,7 +720,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); if (!write_list) - submit_io(b, WRITE, b->block, write_endio); + submit_io(b, WRITE, write_endio); else list_add_tail(&b->write_list, write_list); } @@ -726,7 +733,7 @@ static void __flush_write_list(struct list_head *write_list) struct dm_buffer *b = list_entry(write_list->next, struct dm_buffer, write_list); list_del(&b->write_list); - submit_io(b, WRITE, b->block, write_endio); + submit_io(b, WRITE, write_endio); cond_resched(); } blk_finish_plug(&plug); @@ -1094,7 +1101,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, return NULL; if (need_submit) - submit_io(b, READ, b->block, read_endio); + submit_io(b, READ, read_endio); wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); @@ -1164,7 +1171,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, dm_bufio_unlock(c); if (need_submit) - submit_io(b, READ, b->block, read_endio); + submit_io(b, READ, read_endio); dm_bufio_release(b); cond_resched(); @@ -1405,7 +1412,7 @@ retry: old_block = b->block; __unlink_buffer(b); __link_buffer(b, new_block, b->list_mode); - submit_io(b, WRITE, new_block, write_endio); + submit_io(b, WRITE, write_endio); wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); __unlink_buffer(b); @@ -1762,6 +1769,12 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); +void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) +{ + c->start = start; +} +EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); + static unsigned get_max_age_hz(void) { unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index c096779a7292..b6d8f53ec15b 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h @@ -31,6 +31,13 @@ dm_bufio_client_create(struct block_device *bdev, unsigned block_size, */ void dm_bufio_client_destroy(struct dm_bufio_client *c); +/* + * Set the sector range. + * When this function is called, there must be no I/O in progress on the bufio + * client. 
+ */ +void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start); + /* * WARNING: to avoid deadlocks, these conditions are observed: * From 7eada909bfd7ac90a4522e56aa3179d1fd68cd14 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 4 Jan 2017 20:23:53 +0100 Subject: [PATCH 05/50] dm: add integrity target The dm-integrity target emulates a block device that has additional per-sector tags that can be used for storing integrity information. A general problem with storing integrity tags with every sector is that writing the sector and the integrity tag must be atomic - i.e. in case of crash, either both sector and integrity tag or none of them is written. To guarantee write atomicity the dm-integrity target uses a journal. It writes sector data and integrity tags into a journal, commits the journal and then copies the data and integrity tags to their respective location. The dm-integrity target can be used with the dm-crypt target - in this situation the dm-crypt target creates the integrity data and passes them to the dm-integrity target via bio_integrity_payload attached to the bio. In this mode, the dm-crypt and dm-integrity targets provide authenticated disk encryption - if the attacker modifies the encrypted device, an I/O error is returned instead of random data. The dm-integrity target can also be used as a standalone target, in this mode it calculates and verifies the integrity tag internally. In this mode, the dm-integrity target can be used to detect silent data corruption on the disk or in the I/O path. Signed-off-by: Mikulas Patocka Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 189 ++ drivers/md/Kconfig | 10 + drivers/md/Makefile | 1 + drivers/md/dm-integrity.c | 3085 ++++++++++++++++++ 4 files changed, 3285 insertions(+) create mode 100644 Documentation/device-mapper/dm-integrity.txt create mode 100644 drivers/md/dm-integrity.c diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt new file mode 100644 index 000000000000..2406f56501dc --- /dev/null +++ b/Documentation/device-mapper/dm-integrity.txt @@ -0,0 +1,189 @@ +The dm-integrity target emulates a block device that has additional +per-sector tags that can be used for storing integrity information. + +A general problem with storing integrity tags with every sector is that +writing the sector and the integrity tag must be atomic - i.e. in case of +crash, either both sector and integrity tag or none of them is written. + +To guarantee write atomicity, the dm-integrity target uses journal, it +writes sector data and integrity tags into a journal, commits the journal +and then copies the data and integrity tags to their respective location. + +The dm-integrity target can be used with the dm-crypt target - in this +situation the dm-crypt target creates the integrity data and passes them +to the dm-integrity target via bio_integrity_payload attached to the bio. +In this mode, the dm-crypt and dm-integrity targets provide authenticated +disk encryption - if the attacker modifies the encrypted device, an I/O +error is returned instead of random data. + +The dm-integrity target can also be used as a standalone target, in this +mode it calculates and verifies the integrity tag internally. In this +mode, the dm-integrity target can be used to detect silent data +corruption on the disk or in the I/O path. + + +When loading the target for the first time, the kernel driver will format +the device. 
But it will only format the device if the superblock contains +zeroes. If the superblock is neither valid nor zeroed, the dm-integrity +target can't be loaded. + +To use the target for the first time: +1. overwrite the superblock with zeroes +2. load the dm-integrity target with a size of one sector; the kernel driver + will format the device +3. unload the dm-integrity target +4. read the "provided_data_sectors" value from the superblock +5. load the dm-integrity target with the target size + "provided_data_sectors" +6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target + with the size "provided_data_sectors" + + +Target arguments: + +1. the underlying block device + +2. the number of reserved sectors at the beginning of the device - the + dm-integrity won't read or write these sectors + +3. the size of the integrity tag (if "-" is used, the size is taken from + the internal-hash algorithm) + +4. mode: + D - direct writes (without journal) - in this mode, journaling is + not used and data sectors and integrity tags are written + separately. In case of crash, it is possible that the data + and integrity tag don't match. + J - journaled writes - data and integrity tags are written to the + journal and atomicity is guaranteed. In case of crash, + either both data and tag or none of them are written. The + journaled mode roughly halves write throughput because the + data have to be written twice. + +5. the number of additional arguments + +Additional arguments: + +journal-sectors:number + The size of the journal; this argument is used only when formatting the + device. If the device is already formatted, the value from the + superblock is used. + +interleave-sectors:number + The number of interleaved sectors. This value is rounded down to + a power of two. If the device is already formatted, the value from + the superblock is used. + +buffer-sectors:number + The number of sectors in one buffer. The value is rounded down to + a power of two. + + The tag area is accessed using buffers; the buffer size is + configurable. A large buffer size means that the I/O size will + be larger, but fewer I/Os could be issued. + +journal-watermark:number + The journal watermark in percent. When the size of the journal + exceeds this watermark, the thread that flushes the journal will + be started. + +commit-time:number + Commit time in milliseconds. When this time passes, the journal is + written. The journal is also written immediately if the FLUSH + request is received. + +internal-hash:algorithm(:key) (the key is optional) + Use internal hash or crc. + When this argument is used, the dm-integrity target won't accept + integrity tags from the upper target, but it will automatically + generate and verify the integrity tags. + + You can use a crc algorithm (such as crc32); then the integrity target + will protect the data against accidental corruption. + You can also use a hmac algorithm (for example + "hmac(sha256):0123456789abcdef"); in this mode it will provide + cryptographic authentication of the data without encryption. + + When this argument is not used, the integrity tags are accepted + from an upper layer target, such as dm-crypt. The upper layer + target should check the validity of the integrity tags. + +journal-crypt:algorithm(:key) (the key is optional) + Encrypt the journal using the given algorithm to make sure that the + attacker can't read the journal.
You can use a block cipher here + (such as "cbc(aes)") or a stream cipher (for example "chacha20", + "salsa20", "ctr(aes)" or "ecb(arc4)"). + + The journal contains a history of the last writes to the block device; + an attacker reading the journal could see the last sector numbers + that were written. From the sector numbers, the attacker can infer + the size of files that were written. To protect against this + situation, you can encrypt the journal. + +journal-mac:algorithm(:key) (the key is optional) + Protect sector numbers in the journal from accidental or malicious + modification. To protect against accidental modification, use a + crc algorithm; to protect against malicious modification, use a + hmac algorithm with a key. + + This option is not needed when using internal-hash because in this + mode, the integrity of journal entries is checked when replaying + the journal. Thus, a modified sector number would be detected at + this stage. + + +The journal mode (D/J), buffer-sectors, journal-watermark and commit-time can +be changed when reloading the target (load an inactive table and swap the +tables with suspend and resume). The other arguments should not be changed +when reloading the target because the layout of disk data depends on them +and the reloaded target would be non-functional. + + +The layout of the formatted block device: +* reserved sectors (they are not used by this target, they can be used for + storing LUKS metadata or for other purposes), the size of the reserved + area is specified in the target arguments +* superblock (4kiB) + * magic string - identifies that the device was formatted + * version + * log2(interleave sectors) + * integrity tag size + * the number of journal sections + * provided data sectors - the number of sectors that this target + provides (i.e. the size of the device minus the size of all + metadata and padding). The user of this target should not send + bios that access data beyond the "provided data sectors" limit. + * flags - a flag is set if journal-mac is used +* journal + The journal is divided into sections, each section contains: + * metadata area (4kiB), it contains journal entries + every journal entry contains: + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + every metadata sector ends with + * mac (8 bytes), all the macs in 8 metadata sectors form a + 64-byte value. It is used to store the hmac of sector + numbers in the journal section, to protect against the + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + every sector in the data area contains: + * data (504 bytes of data, the last 8 bytes are stored in + the journal entry) + * commit id + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with an 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. +* one or more runs of interleaved tags and data. Each run contains: + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two.
log2 of this value is stored + in the superblock. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 982cd0626bc7..5c5ed97c9fda 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -500,4 +500,14 @@ config DM_LOG_WRITES If unsure, say N. +config DM_INTEGRITY + tristate "Integrity target" + depends on BLK_DEV_DM + select BLK_DEV_INTEGRITY + select DM_BUFIO + select CRYPTO + select ASYNC_XOR + ---help--- + This is the integrity target. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 2801b2fb452d..39cf2a1b5f90 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c new file mode 100644 index 000000000000..ea779cca8b45 --- /dev/null +++ b/drivers/md/dm-integrity.c @@ -0,0 +1,3085 @@ +/* + * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2016-2017 Milan Broz + * Copyright (C) 2016-2017 Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dm-bufio.h" + +#define DM_MSG_PREFIX "integrity" + +#define DEFAULT_INTERLEAVE_SECTORS 32768 +#define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_BUFFER_SECTORS 128 +#define DEFAULT_JOURNAL_WATERMARK 50 +#define DEFAULT_SYNC_MSEC 10000 +#define DEFAULT_MAX_JOURNAL_SECTORS 131072 +#define MIN_INTERLEAVE_SECTORS 3 +#define MAX_INTERLEAVE_SECTORS 31 +#define METADATA_WORKQUEUE_MAX_ACTIVE 16 + +/* + * Warning - DEBUG_PRINT prints security-sensitive data to the log, + * so it should not be enabled in the official kernel + */ +//#define DEBUG_PRINT +//#define INTERNAL_VERIFY + +/* + * On disk structures + */ + +#define SB_MAGIC "integrt" +#define SB_VERSION 1 +#define SB_SECTORS 8 + +struct superblock { + __u8 magic[8]; + __u8 version; + __u8 log2_interleave_sectors; + __u16 integrity_tag_size; + __u32 journal_sections; + __u64 provided_data_sectors; /* userspace uses this value */ + __u32 flags; +}; + +#define SB_FLAG_HAVE_JOURNAL_MAC 0x1 + +#define JOURNAL_ENTRY_ROUNDUP 8 + +typedef __u64 commit_id_t; +#define JOURNAL_MAC_PER_SECTOR 8 + +struct journal_entry { + union { + struct { + __u32 sector_lo; + __u32 sector_hi; + } s; + __u64 sector; + } u; + commit_id_t last_bytes; + __u8 tag[0]; +}; + +#if BITS_PER_LONG == 64 +#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#elif defined(CONFIG_LBDAF) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#else +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#endif +#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) +#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) +#define journal_entry_is_inprogress(je) 
((je)->u.s.sector_hi == cpu_to_le32(-2)) +#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0) + +#define JOURNAL_BLOCK_SECTORS 8 +#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t)) +#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS) + +struct journal_sector { + __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR]; + __u8 mac[JOURNAL_MAC_PER_SECTOR]; + commit_id_t commit_id; +}; + +#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, tag)) + +#define METADATA_PADDING_SECTORS 8 + +#define N_COMMIT_IDS 4 + +static unsigned char prev_commit_seq(unsigned char seq) +{ + return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS; +} + +static unsigned char next_commit_seq(unsigned char seq) +{ + return (seq + 1) % N_COMMIT_IDS; +} + +/* + * In-memory structures + */ + +struct journal_node { + struct rb_node node; + sector_t sector; +}; + +struct alg_spec { + char *alg_string; + char *key_string; + __u8 *key; + unsigned key_size; +}; + +struct dm_integrity_c { + struct dm_dev *dev; + unsigned tag_size; + __s8 log2_tag_size; + sector_t start; + mempool_t *journal_io_mempool; + struct dm_io_client *io; + struct dm_bufio_client *bufio; + struct workqueue_struct *metadata_wq; + struct superblock *sb; + unsigned journal_pages; + struct page_list *journal; + struct page_list *journal_io; + struct page_list *journal_xor; + + struct crypto_skcipher *journal_crypt; + struct scatterlist **journal_scatterlist; + struct scatterlist **journal_io_scatterlist; + struct skcipher_request **sk_requests; + + struct crypto_shash *journal_mac; + + struct journal_node *journal_tree; + struct rb_root journal_tree_root; + + sector_t provided_data_sectors; + + unsigned short journal_entry_size; + unsigned char journal_entries_per_sector; + unsigned char journal_section_entries; + unsigned char journal_section_sectors; + unsigned journal_sections; + unsigned journal_entries; + sector_t device_sectors; + unsigned initial_sectors; + unsigned metadata_run; + __s8 log2_metadata_run; + __u8 log2_buffer_sectors; + + unsigned char mode; + bool suspending; + + int failed; + + struct crypto_shash *internal_hash; + + /* these variables are locked with endio_wait.lock */ + struct rb_root in_progress; + wait_queue_head_t endio_wait; + struct workqueue_struct *wait_wq; + + unsigned char commit_seq; + commit_id_t commit_ids[N_COMMIT_IDS]; + + unsigned committed_section; + unsigned n_committed_sections; + + unsigned uncommitted_section; + unsigned n_uncommitted_sections; + + unsigned free_section; + unsigned char free_section_entry; + unsigned free_sectors; + + unsigned free_sectors_threshold; + + struct workqueue_struct *commit_wq; + struct work_struct commit_work; + + struct workqueue_struct *writer_wq; + struct work_struct writer_work; + + struct bio_list flush_bio_list; + + unsigned long autocommit_jiffies; + struct timer_list autocommit_timer; + unsigned autocommit_msec; + + wait_queue_head_t copy_to_journal_wait; + + struct completion crypto_backoff; + + bool journal_uptodate; + bool just_formatted; + + struct alg_spec internal_hash_alg; + struct alg_spec journal_crypt_alg; + struct alg_spec journal_mac_alg; +}; + +struct dm_integrity_range { + sector_t logical_sector; + unsigned n_sectors; + struct rb_node node; +}; + +struct dm_integrity_io { + struct work_struct work; + + struct dm_integrity_c *ic; + bool write; + bool fua; + + struct dm_integrity_range range; + + sector_t metadata_block; + unsigned 
metadata_offset; + + atomic_t in_flight; + int bi_error; + + struct completion *completion; + + struct block_device *orig_bi_bdev; + bio_end_io_t *orig_bi_end_io; + struct bio_integrity_payload *orig_bi_integrity; + struct bvec_iter orig_bi_iter; +}; + +struct journal_completion { + struct dm_integrity_c *ic; + atomic_t in_flight; + struct completion comp; +}; + +struct journal_io { + struct dm_integrity_range range; + struct journal_completion *comp; +}; + +static struct kmem_cache *journal_io_cache; + +#define JOURNAL_IO_MEMPOOL 32 + +#ifdef DEBUG_PRINT +#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) +static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) +{ + va_list args; + va_start(args, msg); + vprintk(msg, args); + va_end(args); + if (len) + pr_cont(":"); + while (len) { + pr_cont(" %02x", *bytes); + bytes++; + len--; + } + pr_cont("\n"); +} +#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) +#else +#define DEBUG_print(x, ...) do { } while (0) +#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) +#endif + +/* + * DM Integrity profile, protection is performed layer above (dm-crypt) + */ +static struct blk_integrity_profile dm_integrity_profile = { + .name = "DM-DIF-EXT-TAG", + .generate_fn = NULL, + .verify_fn = NULL, +}; + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static void integrity_bio_wait(struct work_struct *w); +static void dm_integrity_dtr(struct dm_target *ti); + +static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) +{ + if (!cmpxchg(&ic->failed, 0, err)) + DMERR("Error on %s: %d", msg, err); +} + +static int dm_integrity_failed(struct dm_integrity_c *ic) +{ + return ACCESS_ONCE(ic->failed); +} + +static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, + unsigned j, unsigned char seq) +{ + /* + * Xor the number with section and sector, so that if a piece of + * journal is written at wrong place, it is detected. 
+ */ + return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j); +} + +static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, + sector_t *area, sector_t *offset) +{ + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); +} + +static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, + sector_t offset, unsigned *metadata_offset) +{ + __u64 ms; + unsigned mo; + + ms = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + ms += area << ic->log2_metadata_run; + else + ms += area * ic->metadata_run; + ms >>= ic->log2_buffer_sectors; + + if (likely(ic->log2_tag_size >= 0)) { + ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); + mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } else { + ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors); + mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } + *metadata_offset = mo; + return ms; +} + +static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset) +{ + sector_t result; + + result = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + result += (area + 1) << ic->log2_metadata_run; + else + result += (area + 1) * ic->metadata_run; + + result += (sector_t)ic->initial_sectors + offset; + return result; +} + +static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) +{ + if (unlikely(*sec_ptr >= ic->journal_sections)) + *sec_ptr -= ic->journal_sections; +} + +static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = ic->sb; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start; + io_loc.count = SB_SECTORS; + + return dm_io(&io_req, 1, &io_loc, NULL); +} + +static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, + bool e, const char *function) +{ +#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY) + unsigned limit = e ? 
ic->journal_section_entries : ic->journal_section_sectors; + + if (unlikely(section >= ic->journal_sections) || + unlikely(offset >= limit)) { + printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", + function, section, offset, ic->journal_sections, limit); + BUG(); + } +#endif +} + +static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned *pl_index, unsigned *pl_offset) +{ + unsigned sector; + + access_journal_check(ic, section, offset, false, "access_journal"); + + sector = section * ic->journal_section_sectors + offset; + + *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); +} + +static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl, + unsigned section, unsigned offset, unsigned *n_sectors) +{ + unsigned pl_index, pl_offset; + char *va; + + page_list_location(ic, section, offset, &pl_index, &pl_offset); + + if (n_sectors) + *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT; + + va = lowmem_page_address(pl[pl_index].page); + + return (struct journal_sector *)(va + pl_offset); +} + +static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset) +{ + return access_page_list(ic, ic->journal, section, offset, NULL); +} + +static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + unsigned rel_sector, offset; + struct journal_sector *js; + + access_journal_check(ic, section, n, true, "access_journal_entry"); + + rel_sector = n % JOURNAL_BLOCK_SECTORS; + offset = n / JOURNAL_BLOCK_SECTORS; + + js = access_journal(ic, section, rel_sector); + return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size); +} + +static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + access_journal_check(ic, section, n, true, "access_journal_data"); + + return access_journal(ic, section, n + JOURNAL_BLOCK_SECTORS); +} + +static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE]) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned j, size; + + desc->tfm = ic->journal_mac; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto err; + } + + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, section, j); + r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + + size = crypto_shash_digestsize(ic->journal_mac); + + if (likely(size <= JOURNAL_MAC_SIZE)) { + r = crypto_shash_final(desc, result); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memset(result + size, 0, JOURNAL_MAC_SIZE - size); + } else { + __u8 digest[size]; + r = crypto_shash_final(desc, digest); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memcpy(result, digest, JOURNAL_MAC_SIZE); + } + + return; +err: + memset(result, 0, JOURNAL_MAC_SIZE); +} + +static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr) +{ + __u8 result[JOURNAL_MAC_SIZE]; + unsigned j; + + if (!ic->journal_mac) + return; + + section_mac(ic, section, result); + + for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) { + struct 
journal_sector *js = access_journal(ic, section, j); + + if (likely(wr)) + memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); + else { + if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) + dm_integrity_io_error(ic, "journal mac", -EILSEQ); + } + } +} + +static void complete_journal_op(void *context) +{ + struct journal_completion *comp = context; + BUG_ON(!atomic_read(&comp->in_flight)); + if (likely(atomic_dec_and_test(&comp->in_flight))) + complete(&comp->comp); +} + +static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct async_submit_ctl submit; + size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT; + unsigned pl_index, pl_offset, section_index; + struct page_list *source_pl, *target_pl; + + if (likely(encrypt)) { + source_pl = ic->journal; + target_pl = ic->journal_io; + } else { + source_pl = ic->journal_io; + target_pl = ic->journal; + } + + page_list_location(ic, section, 0, &pl_index, &pl_offset); + + atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight); + + init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL); + + section_index = pl_index; + + do { + size_t this_step; + struct page *src_pages[2]; + struct page *dst_page; + + while (unlikely(pl_index == section_index)) { + unsigned dummy; + if (likely(encrypt)) + rw_section_mac(ic, section, true); + section++; + n_sections--; + if (!n_sections) + break; + page_list_location(ic, section, 0, §ion_index, &dummy); + } + + this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset); + dst_page = target_pl[pl_index].page; + src_pages[0] = source_pl[pl_index].page; + src_pages[1] = ic->journal_xor[pl_index].page; + + async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit); + + pl_index++; + pl_offset = 0; + n_bytes -= this_step; + } while (n_bytes); + + BUG_ON(n_sections); + + async_tx_issue_pending_all(); +} + +static void complete_journal_encrypt(struct crypto_async_request *req, int err) +{ + struct journal_completion *comp = req->data; + if (unlikely(err)) { + if (likely(err == -EINPROGRESS)) { + complete(&comp->ic->crypto_backoff); + return; + } + dm_integrity_io_error(comp->ic, "asynchronous encrypt", err); + } + complete_journal_op(comp); +} + +static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) +{ + int r; + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + complete_journal_encrypt, comp); + if (likely(encrypt)) + r = crypto_skcipher_encrypt(req); + else + r = crypto_skcipher_decrypt(req); + if (likely(!r)) + return false; + if (likely(r == -EINPROGRESS)) + return true; + if (likely(r == -EBUSY)) { + wait_for_completion(&comp->ic->crypto_backoff); + reinit_completion(&comp->ic->crypto_backoff); + return true; + } + dm_integrity_io_error(comp->ic, "encrypt", r); + return false; +} + +static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct scatterlist **source_sg; + struct scatterlist **target_sg; + + atomic_add(2, &comp->in_flight); + + if (likely(encrypt)) { + source_sg = ic->journal_scatterlist; + target_sg = ic->journal_io_scatterlist; + } else { + source_sg = ic->journal_io_scatterlist; + target_sg = ic->journal_scatterlist; + } + + do { + struct skcipher_request *req; + unsigned 
ivsize; + char *iv; + + if (likely(encrypt)) + rw_section_mac(ic, section, true); + + req = ic->sk_requests[section]; + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + iv = req->iv; + + memcpy(iv, iv + ivsize, ivsize); + + req->src = source_sg[section]; + req->dst = target_sg[section]; + + if (unlikely(do_crypt(encrypt, req, comp))) + atomic_inc(&comp->in_flight); + + section++; + n_sections--; + } while (n_sections); + + atomic_dec(&comp->in_flight); + complete_journal_op(comp); +} + +static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + if (ic->journal_xor) + return xor_journal(ic, encrypt, section, n_sections, comp); + else + return crypt_journal(ic, encrypt, section, n_sections, comp); +} + +static void complete_journal_io(unsigned long error, void *context) +{ + struct journal_completion *comp = context; + if (unlikely(error != 0)) + dm_integrity_io_error(comp->ic, "writing journal", -EIO); + complete_journal_op(comp); +} + +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + unsigned sector, n_sectors, pl_index, pl_offset; + int r; + + if (unlikely(dm_integrity_failed(ic))) { + if (comp) + complete_journal_io(-1UL, comp); + return; + } + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_PAGE_LIST; + if (ic->journal_io) + io_req.mem.ptr.pl = &ic->journal_io[pl_index]; + else + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + if (likely(comp != NULL)) { + io_req.notify.fn = complete_journal_io; + io_req.notify.context = comp; + } else { + io_req.notify.fn = NULL; + } + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + SB_SECTORS + sector; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, op == REQ_OP_READ ? 
"reading journal" : "writing journal", r); + if (comp) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + complete_journal_io(-1UL, comp); + } + } +} + +static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) +{ + struct journal_completion io_comp; + struct journal_completion crypt_comp_1; + struct journal_completion crypt_comp_2; + unsigned i; + + io_comp.ic = ic; + io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); + + if (commit_start + commit_sections <= ic->journal_sections) { + io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + for (i = 0; i < commit_sections; i++) + rw_section_mac(ic, commit_start + i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp); + } else { + unsigned to_end; + io_comp.in_flight = (atomic_t)ATOMIC_INIT(2); + to_end = ic->journal_sections - commit_start; + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); + if (try_wait_for_completion(&crypt_comp_1.comp)) { + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + crypt_comp_2.ic = ic; + crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp); + crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); + wait_for_completion_io(&crypt_comp_1.comp); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + wait_for_completion_io(&crypt_comp_2.comp); + } + } else { + for (i = 0; i < to_end; i++) + rw_section_mac(ic, commit_start + i, true); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + for (i = 0; i < commit_sections - to_end; i++) + rw_section_mac(ic, i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp); + } + + wait_for_completion_io(&io_comp.comp); +} + +static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned n_sectors, sector_t target, io_notify_fn fn, void *data) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + int r; + unsigned sector, pl_index, pl_offset; + + if (unlikely(dm_integrity_failed(ic))) { + fn(-1UL, data); + return; + } + + sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = REQ_OP_WRITE; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_PAGE_LIST; + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + io_req.notify.fn = fn; + io_req.notify.context = data; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + target; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) 
{ + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + fn(-1UL, data); + } +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + struct rb_node **n = &ic->in_progress.rb_node; + struct rb_node *parent; + + parent = NULL; + + while (*n) { + struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node); + + parent = *n; + if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) { + n = &range->node.rb_left; + } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) { + n = &range->node.rb_right; + } else { + return false; + } + } + + rb_link_node(&new_range->node, parent, n); + rb_insert_color(&new_range->node, &ic->in_progress); + + return true; +} + +static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + rb_erase(&range->node, &ic->in_progress); + wake_up_locked(&ic->endio_wait); +} + +static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); + remove_range_unlocked(ic, range); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); +} + +static void init_journal_node(struct journal_node *node) +{ + RB_CLEAR_NODE(&node->node); + node->sector = (sector_t)-1; +} + +static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector) +{ + struct rb_node **link; + struct rb_node *parent; + + node->sector = sector; + BUG_ON(!RB_EMPTY_NODE(&node->node)); + + link = &ic->journal_tree_root.rb_node; + parent = NULL; + + while (*link) { + struct journal_node *j; + parent = *link; + j = container_of(parent, struct journal_node, node); + if (sector < j->sector) + link = &j->node.rb_left; + else + link = &j->node.rb_right; + } + + rb_link_node(&node->node, parent, link); + rb_insert_color(&node->node, &ic->journal_tree_root); +} + +static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + BUG_ON(RB_EMPTY_NODE(&node->node)); + rb_erase(&node->node, &ic->journal_tree_root); + init_journal_node(node); +} + +#define NOT_FOUND (-1U) + +static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector) +{ + struct rb_node *n = ic->journal_tree_root.rb_node; + unsigned found = NOT_FOUND; + *next_sector = (sector_t)-1; + while (n) { + struct journal_node *j = container_of(n, struct journal_node, node); + if (sector == j->sector) { + found = j - ic->journal_tree; + } + if (sector < j->sector) { + *next_sector = j->sector; + n = j->node.rb_left; + } else { + n = j->node.rb_right; + } + } + + return found; +} + +static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector) +{ + struct journal_node *node, *next_node; + struct rb_node *next; + + if (unlikely(pos >= ic->journal_entries)) + return false; + node = &ic->journal_tree[pos]; + if (unlikely(RB_EMPTY_NODE(&node->node))) + return false; + if (unlikely(node->sector != sector)) + return false; + + next = rb_next(&node->node); + if (unlikely(!next)) + return true; + + next_node = container_of(next, struct journal_node, node); + return next_node->sector != sector; +} + +static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + struct rb_node *next; + struct journal_node *next_node; + unsigned next_section; + + BUG_ON(RB_EMPTY_NODE(&node->node)); + + next = rb_next(&node->node); + if (unlikely(!next)) + return false; + + next_node = 
container_of(next, struct journal_node, node); + + if (next_node->sector != node->sector) + return false; + + next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries; + if (next_section >= ic->committed_section && + next_section < ic->committed_section + ic->n_committed_sections) + return true; + if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections) + return true; + + return false; +} + +#define TAG_READ 0 +#define TAG_WRITE 1 +#define TAG_CMP 2 + +static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, + unsigned *metadata_offset, unsigned total_size, int op) +{ + do { + unsigned char *data, *dp; + struct dm_buffer *b; + unsigned to_copy; + int r; + + r = dm_integrity_failed(ic); + if (unlikely(r)) + return r; + + data = dm_bufio_read(ic->bufio, *metadata_block, &b); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); + dp = data + *metadata_offset; + if (op == TAG_READ) { + memcpy(tag, dp, to_copy); + } else if (op == TAG_WRITE) { + memcpy(dp, tag, to_copy); + dm_bufio_mark_buffer_dirty(b); + } else { + /* e.g.: op == TAG_CMP */ + if (unlikely(memcmp(dp, tag, to_copy))) { + unsigned i; + + for (i = 0; i < to_copy; i++) { + if (dp[i] != tag[i]) + break; + total_size--; + } + dm_bufio_release(b); + return total_size; + } + } + dm_bufio_release(b); + + tag += to_copy; + *metadata_offset += to_copy; + if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) { + (*metadata_block)++; + *metadata_offset = 0; + } + total_size -= to_copy; + } while (unlikely(total_size)); + + return 0; +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +{ + int r; + r = dm_bufio_write_dirty_buffers(ic->bufio); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing tags", r); +} + +static void sleep_on_endio_wait(struct dm_integrity_c *ic) +{ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&ic->endio_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + __remove_wait_queue(&ic->endio_wait, &wait); +} + +static void autocommit_fn(unsigned long data) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)data; + + if (likely(!dm_integrity_failed(ic))) + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void schedule_autocommit(struct dm_integrity_c *ic) +{ + if (!timer_pending(&ic->autocommit_timer)) + mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies); +} + +static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio; + spin_lock_irq(&ic->endio_wait.lock); + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + bio_list_add(&ic->flush_bio_list, bio); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void do_endio(struct dm_integrity_c *ic, struct bio *bio) +{ + int r = dm_integrity_failed(ic); + if (unlikely(r) && !bio->bi_error) + bio->bi_error = r; + bio_endio(bio); +} + +static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + submit_flush_bio(ic, dio); + else + do_endio(ic, bio); +} + +static void 
dec_in_flight(struct dm_integrity_io *dio) +{ + if (atomic_dec_and_test(&dio->in_flight)) { + struct dm_integrity_c *ic = dio->ic; + struct bio *bio; + + remove_range(ic, &dio->range); + + if (unlikely(dio->write)) + schedule_autocommit(ic); + + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->bi_error) && !bio->bi_error) + bio->bi_error = dio->bi_error; + if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + dio->range.logical_sector += dio->range.n_sectors; + bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } + do_endio_flush(ic, dio); + } +} + +static void integrity_end_io(struct bio *bio) +{ + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + bio->bi_iter = dio->orig_bi_iter; + bio->bi_bdev = dio->orig_bi_bdev; + if (dio->orig_bi_integrity) { + bio->bi_integrity = dio->orig_bi_integrity; + bio->bi_opf |= REQ_INTEGRITY; + } + bio->bi_end_io = dio->orig_bi_end_io; + + if (dio->completion) + complete(dio->completion); + + dec_in_flight(dio); +} + +static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, + const char *data, char *result) +{ + __u64 sector_le = cpu_to_le64(sector); + SHASH_DESC_ON_STACK(req, ic->internal_hash); + int r; + unsigned digest_size; + + req->tfm = ic->internal_hash; + req->flags = 0; + + r = crypto_shash_init(req); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto failed; + } + + r = crypto_shash_update(req, (const __u8 *)§or_le, sizeof sector_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_update(req, data, 1 << SECTOR_SHIFT); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + + r = crypto_shash_final(req, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto failed; + } + + digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_metadata(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + + int r; + + if (ic->internal_hash) { + struct bvec_iter iter; + struct bio_vec bv; + unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + char *checksums; + unsigned extra_space = digest_size > ic->tag_size ? 
digest_size - ic->tag_size : 0; + char checksums_onstack[ic->tag_size + extra_space]; + unsigned sectors_to_process = dio->range.n_sectors; + sector_t sector = dio->range.logical_sector; + + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT) * ic->tag_size + extra_space, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (!checksums) + checksums = checksums_onstack; + + __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { + unsigned pos; + char *mem, *checksums_ptr; + +again: + mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset; + pos = 0; + checksums_ptr = checksums; + do { + integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + checksums_ptr += ic->tag_size; + sectors_to_process--; + pos += 1 << SECTOR_SHIFT; + sector++; + } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack); + kunmap_atomic(mem); + + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); + if (unlikely(r)) { + if (r > 0) { + DMERR("Checksum failed at sector 0x%llx", + (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); + r = -EILSEQ; + } + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + if (!sectors_to_process) + break; + + if (unlikely(pos < bv.bv_len)) { + bv.bv_offset += pos; + bv.bv_len -= pos; + goto again; + } + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + } else { + struct bio_integrity_payload *bip = dio->orig_bi_integrity; + + if (bip) { + struct bio_vec biv; + struct bvec_iter iter; + unsigned data_to_process = dio->range.n_sectors * ic->tag_size; + + bip_for_each_vec(biv, bip, iter) { + unsigned char *tag; + unsigned this_len; + + BUG_ON(PageHighMem(biv.bv_page)); + tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; + this_len = min(biv.bv_len, data_to_process); + r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, + this_len, !dio->write ? TAG_READ : TAG_WRITE); + if (unlikely(r)) + goto error; + data_to_process -= this_len; + if (!data_to_process) + break; + } + } + } + dec_in_flight(dio); + return; +error: + dio->bi_error = r; + dec_in_flight(dio); +} + +static int dm_integrity_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + sector_t area, offset; + + dio->ic = ic; + dio->bi_error = 0; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + submit_flush_bio(ic, dio); + return DM_MAPIO_SUBMITTED; + } + + dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + dio->write = bio_op(bio) == REQ_OP_WRITE; + dio->fua = dio->write && bio->bi_opf & REQ_FUA; + if (unlikely(dio->fua)) { + /* + * Don't pass down the FUA flag because we have to flush + * disk cache anyway. 
+ */ + bio->bi_opf &= ~REQ_FUA; + } + if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { + DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", + (unsigned long long)dio->range.logical_sector, bio_sectors(bio), + (unsigned long long)ic->provided_data_sectors); + return -EIO; + } + + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + bio->bi_iter.bi_sector = get_data_sector(ic, area, offset); + + dm_integrity_map_continue(dio, true); + return DM_MAPIO_SUBMITTED; +} + +static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, + unsigned journal_section, unsigned journal_entry) +{ + struct dm_integrity_c *ic = dio->ic; + sector_t logical_sector; + unsigned n_sectors; + + logical_sector = dio->range.logical_sector; + n_sectors = dio->range.n_sectors; + do { + struct bio_vec bv = bio_iovec(bio); + char *mem; + + if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors)) + bv.bv_len = n_sectors << SECTOR_SHIFT; + n_sectors -= bv.bv_len >> SECTOR_SHIFT; + bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); +retry_kmap: + mem = kmap_atomic(bv.bv_page); + if (likely(dio->write)) + flush_dcache_page(bv.bv_page); + + do { + struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); + + if (unlikely(!dio->write)) { + struct journal_sector *js; + + if (unlikely(journal_entry_is_inprogress(je))) { + flush_dcache_page(bv.bv_page); + kunmap_atomic(mem); + + __io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + goto retry_kmap; + } + smp_rmb(); + BUG_ON(journal_entry_get_sector(je) != logical_sector); + js = access_journal_data(ic, journal_section, journal_entry); + memcpy(mem + bv.bv_offset, js, JOURNAL_SECTOR_DATA); + memcpy(mem + bv.bv_offset + JOURNAL_SECTOR_DATA, &je->last_bytes, sizeof je->last_bytes); +#ifdef INTERNAL_VERIFY + if (ic->internal_hash) { + char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; + + integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); + if (unlikely(memcmp(checksums_onstack, je->tag, ic->tag_size))) { + DMERR("Checksum failed when reading from journal, at sector 0x%llx", + (unsigned long long)logical_sector); + } + } +#endif + } + + if (!ic->internal_hash) { + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned tag_todo = ic->tag_size; + char *tag_ptr = je->tag; + + if (bip) do { + struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + unsigned tag_now = min(biv.bv_len, tag_todo); + char *tag_addr; + BUG_ON(PageHighMem(biv.bv_page)); + tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; + if (likely(dio->write)) + memcpy(tag_ptr, tag_addr, tag_now); + else + memcpy(tag_addr, tag_ptr, tag_now); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now); + tag_ptr += tag_now; + tag_todo -= tag_now; + } while (unlikely(tag_todo)); else { + if (likely(dio->write)) + memset(tag_ptr, 0, tag_todo); + } + } + + if (likely(dio->write)) { + struct journal_sector *js; + + js = access_journal_data(ic, journal_section, journal_entry); + memcpy(js, mem + bv.bv_offset, 1 << SECTOR_SHIFT); + je->last_bytes = js->commit_id; + + if (ic->internal_hash) { + unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); + if (unlikely(digest_size > ic->tag_size)) { + char checksums_onstack[digest_size]; + integrity_sector_checksum(ic, logical_sector, (char *)js, 
checksums_onstack); + memcpy(je->tag, checksums_onstack, ic->tag_size); + } else + integrity_sector_checksum(ic, logical_sector, (char *)js, je->tag); + } + + journal_entry_set_sector(je, logical_sector); + } + logical_sector++; + + journal_entry++; + if (unlikely(journal_entry == ic->journal_section_entries)) { + journal_entry = 0; + journal_section++; + wraparound_section(ic, &journal_section); + } + + bv.bv_offset += 1 << SECTOR_SHIFT; + } while (bv.bv_len -= 1 << SECTOR_SHIFT); + + if (unlikely(!dio->write)) + flush_dcache_page(bv.bv_page); + kunmap_atomic(mem); + } while (n_sectors); + + if (likely(dio->write)) { + smp_mb(); + if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) + wake_up(&ic->copy_to_journal_wait); + if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) { + queue_work(ic->commit_wq, &ic->commit_work); + } else { + schedule_autocommit(ic); + } + } else { + remove_range(ic, &dio->range); + } + + if (unlikely(bio->bi_iter.bi_size)) { + sector_t area, offset; + + dio->range.logical_sector = logical_sector; + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); + dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); + return true; + } + + return false; +} + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map) +{ + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned journal_section, journal_entry; + unsigned journal_read_pos; + struct completion read_comp; + bool need_sync_io = ic->internal_hash && !dio->write; + + if (need_sync_io && from_map) { + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->metadata_wq, &dio->work); + return; + } + +lock_retry: + spin_lock_irq(&ic->endio_wait.lock); +retry: + if (unlikely(dm_integrity_failed(ic))) { + spin_unlock_irq(&ic->endio_wait.lock); + do_endio(ic, bio); + return; + } + dio->range.n_sectors = bio_sectors(bio); + journal_read_pos = NOT_FOUND; + if (likely(ic->mode == 'J')) { + if (dio->write) { + unsigned next_entry, i, pos; + unsigned ws, we; + + dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); + if (unlikely(!dio->range.n_sectors)) + goto sleep; + ic->free_sectors -= dio->range.n_sectors; + journal_section = ic->free_section; + journal_entry = ic->free_section_entry; + + next_entry = ic->free_section_entry + dio->range.n_sectors; + ic->free_section_entry = next_entry % ic->journal_section_entries; + ic->free_section += next_entry / ic->journal_section_entries; + ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; + wraparound_section(ic, &ic->free_section); + + pos = journal_section * ic->journal_section_entries + journal_entry; + ws = journal_section; + we = journal_entry; + for (i = 0; i < dio->range.n_sectors; i++) { + struct journal_entry *je; + + add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i); + pos++; + if (unlikely(pos >= ic->journal_entries)) + pos = 0; + + je = access_journal_entry(ic, ws, we); + BUG_ON(!journal_entry_is_unused(je)); + journal_entry_set_inprogress(je); + we++; + if (unlikely(we == ic->journal_section_entries)) { + we = 0; + ws++; + wraparound_section(ic, &ws); + } + } + + spin_unlock_irq(&ic->endio_wait.lock); + goto journal_read_write; + } else { + sector_t next_sector; + journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (likely(journal_read_pos == NOT_FOUND)) { + if (unlikely(dio->range.n_sectors > 
next_sector - dio->range.logical_sector)) + dio->range.n_sectors = next_sector - dio->range.logical_sector; + } else { + unsigned i; + for (i = 1; i < dio->range.n_sectors; i++) { + if (!test_journal_node(ic, journal_read_pos + i, dio->range.logical_sector + i)) + break; + } + dio->range.n_sectors = i; + } + } + } + if (unlikely(!add_new_range(ic, &dio->range))) { + /* + * We must not sleep in the request routine because it could + * stall bios on current->bio_list. + * So, we offload the bio to a workqueue if we have to sleep. + */ +sleep: + if (from_map) { + spin_unlock_irq(&ic->endio_wait.lock); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } else { + sleep_on_endio_wait(ic); + goto retry; + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + if (unlikely(journal_read_pos != NOT_FOUND)) { + journal_section = journal_read_pos / ic->journal_section_entries; + journal_entry = journal_read_pos % ic->journal_section_entries; + goto journal_read_write; + } + + dio->in_flight = (atomic_t)ATOMIC_INIT(2); + + if (need_sync_io) { + read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp); + dio->completion = &read_comp; + } else + dio->completion = NULL; + + dio->orig_bi_iter = bio->bi_iter; + + dio->orig_bi_bdev = bio->bi_bdev; + bio->bi_bdev = ic->dev->bdev; + + dio->orig_bi_integrity = bio_integrity(bio); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; + + dio->orig_bi_end_io = bio->bi_end_io; + bio->bi_end_io = integrity_end_io; + + bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + bio->bi_iter.bi_sector += ic->start; + generic_make_request(bio); + + if (need_sync_io) { + wait_for_completion_io(&read_comp); + integrity_metadata(&dio->work); + } else { + INIT_WORK(&dio->work, integrity_metadata); + queue_work(ic->metadata_wq, &dio->work); + } + + return; + +journal_read_write: + if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry))) + goto lock_retry; + + do_endio_flush(ic, dio); +} + + +static void integrity_bio_wait(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + + dm_integrity_map_continue(dio, false); +} + +static void pad_uncommitted(struct dm_integrity_c *ic) +{ + if (ic->free_section_entry) { + ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry; + ic->free_section_entry = 0; + ic->free_section++; + wraparound_section(ic, &ic->free_section); + ic->n_uncommitted_sections++; + } +} + +static void integrity_commit(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work); + unsigned commit_start, commit_sections; + unsigned i, j, n; + struct bio *flushes; + + del_timer(&ic->autocommit_timer); + + spin_lock_irq(&ic->endio_wait.lock); + flushes = bio_list_get(&ic->flush_bio_list); + if (unlikely(ic->mode != 'J')) { + spin_unlock_irq(&ic->endio_wait.lock); + dm_integrity_flush_buffers(ic); + goto release_flush_bios; + } + + pad_uncommitted(ic); + commit_start = ic->uncommitted_section; + commit_sections = ic->n_uncommitted_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!commit_sections) + goto release_flush_bios; + + i = commit_start; + for (n = 0; n < commit_sections; n++) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je; + je = access_journal_entry(ic, i, j); + io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je)); + } + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js; + js 
= access_journal(ic, i, j); + js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq); + } + i++; + if (unlikely(i >= ic->journal_sections)) + ic->commit_seq = next_commit_seq(ic->commit_seq); + wraparound_section(ic, &i); + } + smp_rmb(); + + write_journal(ic, commit_start, commit_sections); + + spin_lock_irq(&ic->endio_wait.lock); + ic->uncommitted_section += commit_sections; + wraparound_section(ic, &ic->uncommitted_section); + ic->n_uncommitted_sections -= commit_sections; + ic->n_committed_sections += commit_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (ACCESS_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) + queue_work(ic->writer_wq, &ic->writer_work); + +release_flush_bios: + while (flushes) { + struct bio *next = flushes->bi_next; + flushes->bi_next = NULL; + do_endio(ic, flushes); + flushes = next; + } +} + +static void complete_copy_from_journal(unsigned long error, void *context) +{ + struct journal_io *io = context; + struct journal_completion *comp = io->comp; + struct dm_integrity_c *ic = comp->ic; + remove_range(ic, &io->range); + mempool_free(io, ic->journal_io_mempool); + if (unlikely(error != 0)) + dm_integrity_io_error(ic, "copying from journal", -EIO); + complete_journal_op(comp); +} + +static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, + unsigned write_sections, bool from_replay) +{ + unsigned i, j, n; + struct journal_completion comp; + + comp.ic = ic; + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + + i = write_start; + for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) { +#ifndef INTERNAL_VERIFY + if (unlikely(from_replay)) +#endif + rw_section_mac(ic, i, false); + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + sector_t sec, area, offset; + unsigned k, l, next_loop; + sector_t metadata_block; + unsigned metadata_offset; + struct journal_io *io; + + if (journal_entry_is_unused(je)) + continue; + BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay); + sec = journal_entry_get_sector(je); + get_area_and_offset(ic, sec, &area, &offset); + access_journal_data(ic, i, j)->commit_id = je->last_bytes; + for (k = j + 1; k < ic->journal_section_entries; k++) { + struct journal_entry *je2 = access_journal_entry(ic, i, k); + sector_t sec2, area2, offset2; + if (journal_entry_is_unused(je2)) + break; + BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); + sec2 = journal_entry_get_sector(je2); + get_area_and_offset(ic, sec2, &area2, &offset2); + if (area2 != area || offset2 != offset + (k - j)) + break; + access_journal_data(ic, i, k)->commit_id = je2->last_bytes; + } + next_loop = k - 1; + + io = mempool_alloc(ic->journal_io_mempool, GFP_NOIO); + io->comp = &comp; + io->range.logical_sector = sec; + io->range.n_sectors = k - j; + + spin_lock_irq(&ic->endio_wait.lock); + while (unlikely(!add_new_range(ic, &io->range))) + sleep_on_endio_wait(ic); + + if (likely(!from_replay)) { + struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; + + /* don't write if there is newer committed sector */ + while (j < k && find_newer_committed_node(ic, &section_node[j])) { + struct journal_entry *je2 = access_journal_entry(ic, i, j); + + journal_entry_set_unused(je2); + remove_journal_node(ic, &section_node[j]); + j++; + sec++; + offset++; + } + while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) { + struct journal_entry *je2 =
access_journal_entry(ic, i, k - 1); + + journal_entry_set_unused(je2); + remove_journal_node(ic, &section_node[k - 1]); + k--; + } + if (j == k) { + remove_range_unlocked(ic, &io->range); + spin_unlock_irq(&ic->endio_wait.lock); + mempool_free(io, ic->journal_io_mempool); + goto skip_io; + } + for (l = j; l < k; l++) { + remove_journal_node(ic, &section_node[l]); + } + } + spin_unlock_irq(&ic->endio_wait.lock); + + metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); + for (l = j; l < k; l++) { + int r; + struct journal_entry *je2 = access_journal_entry(ic, i, l); + + if ( +#ifndef INTERNAL_VERIFY + unlikely(from_replay) && +#endif + ic->internal_hash) { + unsigned char test_tag[ic->tag_size]; + + integrity_sector_checksum(ic, sec + (l - j), + (char *)access_journal_data(ic, i, l), test_tag); + if (unlikely(memcmp(test_tag, je2->tag, ic->tag_size))) + dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); + } + + journal_entry_set_unused(je2); + r = dm_integrity_rw_tag(ic, je2->tag, &metadata_block, &metadata_offset, + ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + dm_integrity_io_error(ic, "reading tags", r); + } + } + + atomic_inc(&comp.in_flight); + copy_from_journal(ic, i, j, k - j, get_data_sector(ic, area, offset), + complete_copy_from_journal, io); +skip_io: + j = next_loop; + } + } + + dm_bufio_write_dirty_buffers_async(ic->bufio); + + complete_journal_op(&comp); + wait_for_completion_io(&comp.comp); + + dm_integrity_flush_buffers(ic); +} + +static void integrity_writer(struct work_struct *w) +{ + struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work); + unsigned write_start, write_sections; + + unsigned prev_free_sectors; + + /* the following test is not needed, but it tests the replay code */ + if (ACCESS_ONCE(ic->suspending)) + return; + + spin_lock_irq(&ic->endio_wait.lock); + write_start = ic->committed_section; + write_sections = ic->n_committed_sections; + spin_unlock_irq(&ic->endio_wait.lock); + + if (!write_sections) + return; + + do_journal_write(ic, write_start, write_sections, false); + + spin_lock_irq(&ic->endio_wait.lock); + + ic->committed_section += write_sections; + wraparound_section(ic, &ic->committed_section); + ic->n_committed_sections -= write_sections; + + prev_free_sectors = ic->free_sectors; + ic->free_sectors += write_sections * ic->journal_section_entries; + if (unlikely(!prev_free_sectors)) + wake_up_locked(&ic->endio_wait); + + spin_unlock_irq(&ic->endio_wait.lock); +} + +static void init_journal(struct dm_integrity_c *ic, unsigned start_section, + unsigned n_sections, unsigned char commit_seq) +{ + unsigned i, j, n; + + if (!n_sections) + return; + + for (n = 0; n < n_sections; n++) { + i = start_section + n; + wraparound_section(ic, &i); + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + memset(&js->entries, 0, JOURNAL_SECTOR_DATA); + js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq); + } + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + journal_entry_set_unused(je); + } + } + + write_journal(ic, start_section, n_sections); +} + +static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id) +{ + unsigned char k; + for (k = 0; k < N_COMMIT_IDS; k++) { + if (dm_integrity_commit_id(ic, i, j, k) == id) + return k; + } + dm_integrity_io_error(ic, "journal commit id", -EIO); + return -EIO; +} + +static void
replay_journal(struct dm_integrity_c *ic) +{ + unsigned i, j; + bool used_commit_ids[N_COMMIT_IDS]; + unsigned max_commit_id_sections[N_COMMIT_IDS]; + unsigned write_start, write_sections; + unsigned continue_section; + bool journal_empty; + unsigned char unused, last_used, want_commit_seq; + + if (ic->journal_uptodate) + return; + + last_used = 0; + write_start = 0; + + if (!ic->just_formatted) { + DEBUG_print("reading journal\n"); + rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL); + if (ic->journal_io) + DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal"); + if (ic->journal_io) { + struct journal_completion crypt_comp; + crypt_comp.ic = ic; + crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp); + crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp); + wait_for_completion(&crypt_comp.comp); + } + DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal"); + } + + if (dm_integrity_failed(ic)) + goto clear_journal; + + journal_empty = true; + memset(used_commit_ids, 0, sizeof used_commit_ids); + memset(max_commit_id_sections, 0, sizeof max_commit_id_sections); + for (i = 0; i < ic->journal_sections; i++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + int k; + struct journal_sector *js = access_journal(ic, i, j); + k = find_commit_seq(ic, i, j, js->commit_id); + if (k < 0) + goto clear_journal; + used_commit_ids[k] = true; + max_commit_id_sections[k] = i; + } + if (journal_empty) { + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, i, j); + if (!journal_entry_is_unused(je)) { + journal_empty = false; + break; + } + } + } + } + + if (!used_commit_ids[N_COMMIT_IDS - 1]) { + unused = N_COMMIT_IDS - 1; + while (unused && !used_commit_ids[unused - 1]) + unused--; + } else { + for (unused = 0; unused < N_COMMIT_IDS; unused++) + if (!used_commit_ids[unused]) + break; + if (unused == N_COMMIT_IDS) { + dm_integrity_io_error(ic, "journal commit ids", -EIO); + goto clear_journal; + } + } + DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n", + unused, used_commit_ids[0], used_commit_ids[1], + used_commit_ids[2], used_commit_ids[3]); + + last_used = prev_commit_seq(unused); + want_commit_seq = prev_commit_seq(last_used); + + if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)]) + journal_empty = true; + + write_start = max_commit_id_sections[last_used] + 1; + if (unlikely(write_start >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &write_start); + + i = write_start; + for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) { + for (j = 0; j < ic->journal_section_sectors; j++) { + struct journal_sector *js = access_journal(ic, i, j); + + if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) { + /* + * This could be caused by crash during writing. + * We won't replay the inconsistent part of the + * journal. 
+ */ + DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n", + i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq); + goto brk; + } + } + i++; + if (unlikely(i >= ic->journal_sections)) + want_commit_seq = next_commit_seq(want_commit_seq); + wraparound_section(ic, &i); + } +brk: + + if (!journal_empty) { + DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n", + write_sections, write_start, want_commit_seq); + do_journal_write(ic, write_start, write_sections, true); + } + + if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) { + continue_section = write_start; + ic->commit_seq = want_commit_seq; + DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq); + } else { + unsigned s; + unsigned char erase_seq; +clear_journal: + DEBUG_print("clearing journal\n"); + + erase_seq = prev_commit_seq(prev_commit_seq(last_used)); + s = write_start; + init_journal(ic, s, 1, erase_seq); + s++; + wraparound_section(ic, &s); + if (ic->journal_sections >= 2) { + init_journal(ic, s, ic->journal_sections - 2, erase_seq); + s += ic->journal_sections - 2; + wraparound_section(ic, &s); + init_journal(ic, s, 1, erase_seq); + } + + continue_section = 0; + ic->commit_seq = next_commit_seq(erase_seq); + } + + ic->committed_section = continue_section; + ic->n_committed_sections = 0; + + ic->uncommitted_section = continue_section; + ic->n_uncommitted_sections = 0; + + ic->free_section = continue_section; + ic->free_section_entry = 0; + ic->free_sectors = ic->journal_entries; + + ic->journal_tree_root = RB_ROOT; + for (i = 0; i < ic->journal_entries; i++) + init_journal_node(&ic->journal_tree[i]); +} + +static void dm_integrity_postsuspend(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + + del_timer_sync(&ic->autocommit_timer); + + ic->suspending = true; + + queue_work(ic->commit_wq, &ic->commit_work); + drain_workqueue(ic->commit_wq); + + if (ic->mode == 'J') { + drain_workqueue(ic->writer_wq); + dm_integrity_flush_buffers(ic); + } + + ic->suspending = false; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + ic->journal_uptodate = true; +} + +static void dm_integrity_resume(struct dm_target *ti) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + + replay_journal(ic); +} + +static void dm_integrity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + unsigned arg_count; + size_t sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); + arg_count = 5; + arg_count += !!ic->internal_hash_alg.alg_string; + arg_count += !!ic->journal_crypt_alg.alg_string; + arg_count += !!ic->journal_mac_alg.alg_string; + DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, + ic->tag_size, ic->mode, arg_count); + DMEMIT(" journal-sectors:%u", ic->initial_sectors - SB_SECTORS); + DMEMIT(" interleave-sectors:%u", 1U << ic->sb->log2_interleave_sectors); + DMEMIT(" buffer-sectors:%u", 1U << ic->log2_buffer_sectors); + DMEMIT(" journal-watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit-time:%u", ic->autocommit_msec); + +#define EMIT_ALG(a, n) \ + do { \ + if 
(ic->a.alg_string) { \ + DMEMIT(" %s:%s", n, ic->a.alg_string); \ + if (ic->a.key_string) \ + DMEMIT(":%s", ic->a.key_string);\ + } \ + } while (0) + EMIT_ALG(internal_hash_alg, "internal-hash"); + EMIT_ALG(journal_crypt_alg, "journal-crypt"); + EMIT_ALG(journal_mac_alg, "journal-mac"); + break; + } + } +} + +static int dm_integrity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_integrity_c *ic = ti->private; + + return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); +} + +static void calculate_journal_section_size(struct dm_integrity_c *ic) +{ + unsigned sector_space = JOURNAL_SECTOR_DATA; + + ic->journal_sections = le32_to_cpu(ic->sb->journal_sections); + ic->journal_entry_size = roundup(offsetof(struct journal_entry, tag) + ic->tag_size, + JOURNAL_ENTRY_ROUNDUP); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) + sector_space -= JOURNAL_MAC_PER_SECTOR; + ic->journal_entries_per_sector = sector_space / ic->journal_entry_size; + ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS; + ic->journal_section_sectors = ic->journal_section_entries + JOURNAL_BLOCK_SECTORS; + ic->journal_entries = ic->journal_section_entries * ic->journal_sections; +} + +static int calculate_device_limits(struct dm_integrity_c *ic) +{ + __u64 initial_sectors; + sector_t last_sector, last_area, last_offset; + + calculate_journal_section_size(ic); + initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections; + if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX) + return -EINVAL; + ic->initial_sectors = initial_sectors; + + ic->metadata_run = roundup((__u64)ic->tag_size << ic->sb->log2_interleave_sectors, + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + if (!(ic->metadata_run & (ic->metadata_run - 1))) + ic->log2_metadata_run = __ffs(ic->metadata_run); + else + ic->log2_metadata_run = -1; + + get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset); + last_sector = get_data_sector(ic, last_area, last_offset); + + if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors) + return -EINVAL; + + return 0; +} + +static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors) +{ + unsigned journal_sections; + int test_bit; + + memcpy(ic->sb->magic, SB_MAGIC, 8); + ic->sb->version = SB_VERSION; + ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); + if (ic->journal_mac_alg.alg_string) + ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC); + + calculate_journal_section_size(ic); + journal_sections = journal_sectors / ic->journal_section_sectors; + if (!journal_sections) + journal_sections = 1; + ic->sb->journal_sections = cpu_to_le32(journal_sections); + + ic->sb->log2_interleave_sectors = __fls(interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + + if (!le64_to_cpu(ic->provided_data_sectors)) 
+ return -EINVAL; + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + + return 0; +} + +static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) +{ + struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); + struct blk_integrity bi; + + memset(&bi, 0, sizeof(bi)); + bi.profile = &dm_integrity_profile; + bi.tuple_size = ic->tag_size * (queue_logical_block_size(disk->queue) >> SECTOR_SHIFT); + bi.tag_size = ic->tag_size; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, UINT_MAX); +} + +/* FIXME: use new kvmalloc */ +static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp) +{ + void *ptr = NULL; + + if (size <= PAGE_SIZE) + ptr = kmalloc(size, GFP_KERNEL | gfp); + if (!ptr && size <= KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp); + if (!ptr) + ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL); + + return ptr; +} + +static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +{ + unsigned i; + + if (!pl) + return; + for (i = 0; i < ic->journal_pages; i++) + if (pl[i].page) + __free_page(pl[i].page); + kvfree(pl); +} + +static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +{ + size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); + struct page_list *pl; + unsigned i; + + pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO); + if (!pl) + return NULL; + + for (i = 0; i < ic->journal_pages; i++) { + pl[i].page = alloc_page(GFP_KERNEL); + if (!pl[i].page) { + dm_integrity_free_page_list(ic, pl); + return NULL; + } + if (i) + pl[i - 1].next = &pl[i]; + } + + return pl; +} + +static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl) +{ + unsigned i; + for (i = 0; i < ic->journal_sections; i++) + kvfree(sl[i]); + kfree(sl); +} + +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +{ + struct scatterlist **sl; + unsigned i; + + sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO); + if (!sl) + return NULL; + + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist *s; + unsigned start_index, start_offset; + unsigned end_index, end_offset; + unsigned n_pages; + unsigned idx; + + page_list_location(ic, i, 0, &start_index, &start_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + + n_pages = (end_index - start_index + 1); + + s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0); + if (!s) { + dm_integrity_free_journal_scatterlist(ic, sl); + return NULL; + } + + sg_init_table(s, n_pages); + for (idx = start_index; idx <= end_index; idx++) { + char *va = lowmem_page_address(pl[idx].page); + unsigned start = 0, end = PAGE_SIZE; + if (idx == start_index) + start = start_offset; + if (idx == end_index) + end = end_offset + (1 << SECTOR_SHIFT); + sg_set_buf(&s[idx - start_index], va + start, end - start); + } + + sl[i] = s; + } + + return sl; +} + +static void free_alg(struct alg_spec *a) +{ + kzfree(a->alg_string); + kzfree(a->key); + memset(a, 0, sizeof *a); +} + +static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval) +{ + char *k; + + free_alg(a); + + a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL); + if (!a->alg_string) + goto nomem; + + k = strchr(a->alg_string, ':'); + if (k) { + unsigned i; + + *k = 0; + a->key_string = 
k + 1; + if (strlen(a->key_string) & 1) + goto inval; + + a->key_size = strlen(a->key_string) / 2; + a->key = kmalloc(a->key_size, GFP_KERNEL); + if (!a->key) + goto nomem; + for (i = 0; i < a->key_size; i++) { + char digit[3]; + digit[0] = a->key_string[i * 2]; + digit[1] = a->key_string[i * 2 + 1]; + digit[2] = 0; + if (strspn(digit, "0123456789abcdefABCDEF") != 2) + goto inval; + if (kstrtou8(digit, 16, &a->key[i])) + goto inval; + } + } + + return 0; +inval: + *error = error_inval; + return -EINVAL; +nomem: + *error = "Out of memory for an argument"; + return -ENOMEM; +} + +static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, + char *error_alg, char *error_key) +{ + int r; + + if (a->alg_string) { + *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*hash)) { + *error = error_alg; + r = PTR_ERR(*hash); + *hash = NULL; + return r; + } + + if (a->key) { + r = crypto_shash_setkey(*hash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } + } + + return 0; +} + +/* + * Construct a integrity mapping: + * + * Arguments: + * device + * offset from the start of the device + * tag size + * D - direct writes, J - journal writes + * number of optional arguments + * optional arguments: + * journal-sectors + * interleave-sectors + * buffer-sectors + * journal-watermark + * commit-time + * internal-hash + * journal-crypt + * journal-mac + */ +static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_integrity_c *ic; + char dummy; + int r; + unsigned i; + unsigned extra_args; + struct dm_arg_set as; + static struct dm_arg _args[] = { + {0, 7, "Invalid number of feature args"}, + }; + unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; + bool should_write_sb; + __u64 journal_pages, journal_desc_size, journal_tree_size; + __u64 threshold; + unsigned long long start; + +#define DIRECT_ARGUMENTS 4 + + if (argc <= DIRECT_ARGUMENTS) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL); + if (!ic) { + ti->error = "Cannot allocate integrity context"; + return -ENOMEM; + } + ti->private = ic; + ti->per_io_data_size = sizeof(struct dm_integrity_io); + + ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); + ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); + ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); + ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); + + ic->in_progress = RB_ROOT; + init_waitqueue_head(&ic->endio_wait); + bio_list_init(&ic->flush_bio_list); + init_waitqueue_head(&ic->copy_to_journal_wait); + init_completion(&ic->crypto_backoff); + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); + if (r) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { + ti->error = "Invalid starting offset"; + r = -EINVAL; + goto bad; + } + ic->start = start; + + if (strcmp(argv[2], "-")) { + if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) { + ti->error = "Invalid tag size"; + r = -EINVAL; + goto bad; + } + } + + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D")) + ic->mode = argv[3][0]; + else { + ti->error = "Invalid mode (expecting J or D)"; + r = -EINVAL; + goto bad; + } + + ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT; + journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, + ic->device_sectors 
>> DEFAULT_JOURNAL_SIZE_FACTOR); + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; + buffer_sectors = DEFAULT_BUFFER_SECTORS; + journal_watermark = DEFAULT_JOURNAL_WATERMARK; + sync_msec = DEFAULT_SYNC_MSEC; + + as.argc = argc - DIRECT_ARGUMENTS; + as.argv = argv + DIRECT_ARGUMENTS; + r = dm_read_arg_group(_args, &as, &extra_args, &ti->error); + if (r) + goto bad; + + while (extra_args--) { + const char *opt_string; + unsigned val; + opt_string = dm_shift_arg(&as); + if (!opt_string) { + r = -EINVAL; + ti->error = "Not enough feature arguments"; + goto bad; + } + if (sscanf(opt_string, "journal-sectors:%u%c", &val, &dummy) == 1) + journal_sectors = val; + else if (sscanf(opt_string, "interleave-sectors:%u%c", &val, &dummy) == 1) + interleave_sectors = val; + else if (sscanf(opt_string, "buffer-sectors:%u%c", &val, &dummy) == 1) + buffer_sectors = val; + else if (sscanf(opt_string, "journal-watermark:%u%c", &val, &dummy) == 1 && val <= 100) + journal_watermark = val; + else if (sscanf(opt_string, "commit-time:%u%c", &val, &dummy) == 1) + sync_msec = val; + else if (!memcmp(opt_string, "internal-hash:", strlen("internal-hash:"))) { + r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, + "Invalid internal-hash argument"); + if (r) + goto bad; + } else if (!memcmp(opt_string, "journal-crypt:", strlen("journal-crypt:"))) { + r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error, + "Invalid journal-crypt argument"); + if (r) + goto bad; + } else if (!memcmp(opt_string, "journal-mac:", strlen("journal-mac:"))) { + r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, + "Invalid journal-mac argument"); + if (r) + goto bad; + } else { + r = -EINVAL; + ti->error = "Invalid argument"; + goto bad; + } + } + + r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + "Invalid internal hash", "Error setting internal hash key"); + if (r) + goto bad; + + r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + "Invalid journal mac", "Error setting journal mac key"); + if (r) + goto bad; + + if (!ic->tag_size) { + if (!ic->internal_hash) { + ti->error = "Unknown tag size"; + r = -EINVAL; + goto bad; + } + ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + } + if (ic->tag_size > MAX_TAG_SIZE) { + ti->error = "Too big tag size"; + r = -EINVAL; + goto bad; + } + if (!(ic->tag_size & (ic->tag_size - 1))) + ic->log2_tag_size = __ffs(ic->tag_size); + else + ic->log2_tag_size = -1; + + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); + ic->autocommit_msec = sync_msec; + setup_timer(&ic->autocommit_timer, autocommit_fn, (unsigned long)ic); + + ic->io = dm_io_client_create(); + if (IS_ERR(ic->io)) { + r = PTR_ERR(ic->io); + ic->io = NULL; + ti->error = "Cannot allocate dm io"; + goto bad; + } + + ic->journal_io_mempool = mempool_create_slab_pool(JOURNAL_IO_MEMPOOL, journal_io_cache); + if (!ic->journal_io_mempool) { + r = -ENOMEM; + ti->error = "Cannot allocate mempool"; + goto bad; + } + + ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", + WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->metadata_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + /* + * If this workqueue were percpu, it would cause bio reordering + * and reduced performance. 
+ */ + ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!ic->wait_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); + if (!ic->commit_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->commit_work, integrity_commit); + + if (ic->mode == 'J') { + ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); + if (!ic->writer_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + INIT_WORK(&ic->writer_work, integrity_writer); + } + + ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL); + if (!ic->sb) { + r = -ENOMEM; + ti->error = "Cannot allocate superblock area"; + goto bad; + } + + r = sync_rw_sb(ic, REQ_OP_READ, 0); + if (r) { + ti->error = "Error reading superblock"; + goto bad; + } + if (!memcmp(ic->sb->magic, SB_MAGIC, 8)) { + should_write_sb = false; + } else { + for (i = 0; i < 512; i += 8) { + if (*(__u64 *)((__u8 *)ic->sb + i)) { + r = -EINVAL; + ti->error = "The device is not initialized"; + goto bad; + } + } + + r = initialize_superblock(ic, journal_sectors, interleave_sectors); + if (r) { + ti->error = "Could not initialize superblock"; + goto bad; + } + should_write_sb = true; + } + + if (ic->sb->version != SB_VERSION) { + r = -EINVAL; + ti->error = "Unknown version"; + goto bad; + } + if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { + r = -EINVAL; + ti->error = "Invalid tag size"; + goto bad; + } + /* make sure that ti->max_io_len doesn't overflow */ + if (ic->sb->log2_interleave_sectors < MIN_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_INTERLEAVE_SECTORS) { + r = -EINVAL; + ti->error = "Invalid interleave_sectors in the superblock"; + goto bad; + } + ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); + if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { + /* test for overflow */ + r = -EINVAL; + ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors"; + goto bad; + } + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { + r = -EINVAL; + ti->error = "Journal mac mismatch"; + goto bad; + } + r = calculate_device_limits(ic); + if (r) { + ti->error = "The device is too small"; + goto bad; + } + + if (!buffer_sectors) + buffer_sectors = 1; + ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT); + + threshold = (__u64)ic->journal_entries * (100 - journal_watermark); + threshold += 50; + do_div(threshold, 100); + ic->free_sectors_threshold = threshold; + + DEBUG_print("initialized:\n"); + DEBUG_print(" integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size)); + DEBUG_print(" journal_entry_size %u\n", ic->journal_entry_size); + DEBUG_print(" journal_entries_per_sector %u\n", ic->journal_entries_per_sector); + DEBUG_print(" journal_section_entries %u\n", ic->journal_section_entries); + DEBUG_print(" journal_section_sectors %u\n", ic->journal_section_sectors); + DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); + DEBUG_print(" journal_entries %u\n", ic->journal_entries); + DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); + DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); + DEBUG_print(" initial_sectors 
0x%x\n", ic->initial_sectors); + DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); + DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); + DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, + (unsigned long long)ic->provided_data_sectors); + DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + + ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), + 1, 0, NULL, NULL); + if (IS_ERR(ic->bufio)) { + r = PTR_ERR(ic->bufio); + ti->error = "Cannot initialize dm-bufio"; + ic->bufio = NULL; + goto bad; + } + dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); + + journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, + PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); + journal_desc_size = journal_pages * sizeof(struct page_list); + if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { + ti->error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_pages = journal_pages; + + ic->journal = dm_integrity_alloc_page_list(ic); + if (!ic->journal) { + ti->error = "Could not allocate memory for journal"; + r = -ENOMEM; + goto bad; + } + if (ic->journal_crypt_alg.alg_string) { + unsigned ivsize, blocksize; + struct journal_completion comp; + comp.ic = ic; + + ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); + if (IS_ERR(ic->journal_crypt)) { + ti->error = "Invalid journal cipher"; + r = PTR_ERR(ic->journal_crypt); + ic->journal_crypt = NULL; + goto bad; + } + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + blocksize = crypto_skcipher_blocksize(ic->journal_crypt); + + if (ic->journal_crypt_alg.key) { + r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, + ic->journal_crypt_alg.key_size); + if (r) { + ti->error = "Error setting encryption key"; + goto bad; + } + } + DEBUG_print("cipher %s, block size %u iv size %u\n", + ic->journal_crypt_alg.alg_string, blocksize, ivsize); + + ic->journal_io = dm_integrity_alloc_page_list(ic); + if (!ic->journal_io) { + ti->error = "Could not allocate memory for journal io"; + r = -ENOMEM; + goto bad; + } + + if (blocksize == 1) { + struct scatterlist *sg; + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_xor = dm_integrity_alloc_page_list(ic); + if (!ic->journal_xor) { + ti->error = "Could not allocate memory for journal xor"; + r = -ENOMEM; + goto bad; + } + + sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0); + if (!sg) { + ti->error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + sg_init_table(sg, ic->journal_pages + 1); + for (i = 0; i < ic->journal_pages; i++) { + char *va = lowmem_page_address(ic->journal_xor[i].page); + clear_page(va); + sg_set_buf(&sg[i], va, PAGE_SIZE); + } + sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); + memset(iv, 0x00, ivsize); + + skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + kvfree(sg); + if ((r = dm_integrity_failed(ic))) { + ti->error = "Unable to encrypt journal"; + goto bad; + } + DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor 
data"); + + crypto_free_skcipher(ic->journal_crypt); + ic->journal_crypt = NULL; + } else { + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + unsigned crypt_len = roundup(ivsize, blocksize); + unsigned char crypt_data[crypt_len]; + + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); + if (!ic->journal_scatterlist) { + ti->error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); + if (!ic->journal_io_scatterlist) { + ti->error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO); + if (!ic->sk_requests) { + ti->error = "Unable to allocate sk requests"; + r = -ENOMEM; + goto bad; + } + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist sg; + struct skcipher_request *section_req; + __u32 section_le = cpu_to_le32(i); + + memset(iv, 0x00, ivsize); + memset(crypt_data, 0x00, crypt_len); + memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); + + sg_init_one(&sg, crypt_data, crypt_len); + skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + + if ((r = dm_integrity_failed(ic))) { + ti->error = "Unable to generate iv"; + goto bad; + } + + section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!section_req) { + ti->error = "Unable to allocate crypt request"; + r = -ENOMEM; + goto bad; + } + section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL); + if (!section_req->iv) { + skcipher_request_free(section_req); + ti->error = "Unable to allocate iv"; + r = -ENOMEM; + goto bad; + } + memcpy(section_req->iv + ivsize, crypt_data, ivsize); + section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; + ic->sk_requests[i] = section_req; + DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); + } + } + } + + for (i = 0; i < N_COMMIT_IDS; i++) { + unsigned j; +retest_commit_id: + for (j = 0; j < i; j++) { + if (ic->commit_ids[j] == ic->commit_ids[i]) { + ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); + goto retest_commit_id; + } + } + DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); + } + + journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); + if (journal_tree_size > ULONG_MAX) { + ti->error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0); + if (!ic->journal_tree) { + ti->error = "Could not allocate memory for journal tree"; + r = -ENOMEM; + goto bad; + } + + if (should_write_sb) { + int r; + + init_journal(ic, 0, ic->journal_sections, 0); + r = dm_integrity_failed(ic); + if (unlikely(r)) { + ti->error = "Error initializing journal"; + goto bad; + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (r) { + ti->error = "Error initializing superblock"; + goto bad; + } + ic->just_formatted = true; + } + + r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors); + if (r) + goto bad; + + if (!ic->internal_hash) + dm_integrity_set(ti, ic); + + ti->num_flush_bios = 1; + ti->flush_supported = true; + + return 0; +bad: + dm_integrity_dtr(ti); + return r; +} + +static void 
dm_integrity_dtr(struct dm_target *ti) +{ + struct dm_integrity_c *ic = ti->private; + + BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); + + if (ic->metadata_wq) + destroy_workqueue(ic->metadata_wq); + if (ic->wait_wq) + destroy_workqueue(ic->wait_wq); + if (ic->commit_wq) + destroy_workqueue(ic->commit_wq); + if (ic->writer_wq) + destroy_workqueue(ic->writer_wq); + if (ic->bufio) + dm_bufio_client_destroy(ic->bufio); + mempool_destroy(ic->journal_io_mempool); + if (ic->io) + dm_io_client_destroy(ic->io); + if (ic->dev) + dm_put_device(ti, ic->dev); + dm_integrity_free_page_list(ic, ic->journal); + dm_integrity_free_page_list(ic, ic->journal_io); + dm_integrity_free_page_list(ic, ic->journal_xor); + if (ic->journal_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); + if (ic->journal_io_scatterlist) + dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist); + if (ic->sk_requests) { + unsigned i; + + for (i = 0; i < ic->journal_sections; i++) { + struct skcipher_request *req = ic->sk_requests[i]; + if (req) { + kzfree(req->iv); + skcipher_request_free(req); + } + } + kvfree(ic->sk_requests); + } + kvfree(ic->journal_tree); + if (ic->sb) + free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); + + if (ic->internal_hash) + crypto_free_shash(ic->internal_hash); + free_alg(&ic->internal_hash_alg); + + if (ic->journal_crypt) + crypto_free_skcipher(ic->journal_crypt); + free_alg(&ic->journal_crypt_alg); + + if (ic->journal_mac) + crypto_free_shash(ic->journal_mac); + free_alg(&ic->journal_mac_alg); + + kfree(ic); +} + +static struct target_type integrity_target = { + .name = "integrity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, + .ctr = dm_integrity_ctr, + .dtr = dm_integrity_dtr, + .map = dm_integrity_map, + .postsuspend = dm_integrity_postsuspend, + .resume = dm_integrity_resume, + .status = dm_integrity_status, + .iterate_devices = dm_integrity_iterate_devices, +}; + +int __init dm_integrity_init(void) +{ + int r; + + journal_io_cache = kmem_cache_create("integrity_journal_io", + sizeof(struct journal_io), 0, 0, NULL); + if (!journal_io_cache) { + DMERR("can't allocate journal io cache"); + return -ENOMEM; + } + + r = dm_register_target(&integrity_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +void dm_integrity_exit(void) +{ + dm_unregister_target(&integrity_target); + kmem_cache_destroy(journal_io_cache); +} + +module_init(dm_integrity_init); +module_exit(dm_integrity_exit); + +MODULE_AUTHOR("Milan Broz"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension"); +MODULE_LICENSE("GPL"); From ef43aa38063a6b2b3c6618e28ab35794f4f1fe29 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 4 Jan 2017 20:23:54 +0100 Subject: [PATCH 06/50] dm crypt: add cryptographic data integrity protection (authenticated encryption) Allow the use of per-sector metadata, provided by the dm-integrity module, for integrity protection and persistently stored per-sector Initialization Vector (IV). The underlying device must support the "DM-DIF-EXT-TAG" dm-integrity profile. The per-bio integrity metadata is allocated by dm-crypt for every bio. 
Example of low-level mapping table for various types of use: DEV=/dev/sdb SIZE=417792 # Additional HMAC with CBC-ESSIV, key is concatenated encryption key + HMAC key SIZE_INT=389952 dmsetup create x --table "0 $SIZE_INT integrity $DEV 0 32 J 0" dmsetup create y --table "0 $SIZE_INT crypt aes-cbc-essiv:sha256 \ 11ff33c6fb942655efb3e30cf4c0fd95f5ef483afca72166c530ae26151dd83b \ 00112233445566778899aabbccddeeff00112233445566778899aabbccddeeff \ 0 /dev/mapper/x 0 1 integrity:32:hmac(sha256)" # AEAD (Authenticated Encryption with Additional Data) - GCM with random IVs # GCM in kernel uses a 96-bit IV and we store a 128-bit auth tag (so 28 bytes of metadata space) SIZE_INT=393024 dmsetup create x --table "0 $SIZE_INT integrity $DEV 0 28 J 0" dmsetup create y --table "0 $SIZE_INT crypt aes-gcm-random \ 11ff33c6fb942655efb3e30cf4c0fd95f5ef483afca72166c530ae26151dd83b \ 0 /dev/mapper/x 0 1 integrity:28:aead" # Random IV only for XTS mode (no integrity protection but provides atomic random sector change) SIZE_INT=401272 dmsetup create x --table "0 $SIZE_INT integrity $DEV 0 16 J 0" dmsetup create y --table "0 $SIZE_INT crypt aes-xts-random \ 11ff33c6fb942655efb3e30cf4c0fd95f5ef483afca72166c530ae26151dd83b \ 0 /dev/mapper/x 0 1 integrity:16:none" # Random IV with XTS + HMAC integrity protection SIZE_INT=377656 dmsetup create x --table "0 $SIZE_INT integrity $DEV 0 48 J 0" dmsetup create y --table "0 $SIZE_INT crypt aes-xts-random \ 11ff33c6fb942655efb3e30cf4c0fd95f5ef483afca72166c530ae26151dd83b \ 00112233445566778899aabbccddeeff00112233445566778899aabbccddeeff \ 0 /dev/mapper/x 0 1 integrity:48:hmac(sha256)" Both AEAD and HMAC protection authenticate not only the data but also the sector metadata. HMAC protection is implemented through the authenc wrapper (so it is processed the same way as an authenticated mode). In HMAC mode there are two keys (concatenated in the dm-crypt mapping table): the first is the encryption key and the second is the authentication (HMAC) key. (It is a userspace decision whether these keys are independent or somehow derived.) The sector request for AEAD/HMAC authenticated encryption looks like this: |----- AAD -------|------ DATA -------|-- AUTH TAG --| | (authenticated) | (auth+encryption) | | | sector_LE | IV | sector in/out | tag in/out | For writes, the integrity fields are calculated during AEAD encryption of every sector, stored in the bio integrity fields and sent to the underlying dm-integrity target for storage. For reads, the integrity metadata is verified during AEAD decryption of every sector (the fields are filled in by dm-integrity, but pre-allocated in dm-crypt). There is also experimental support in the cryptsetup utility for more friendly configuration (part of the LUKS2 format). Because the integrity fields are not valid on initial creation, the device must be "formatted". This can be done by direct-io writes to the device (e.g. dd in direct-io mode).
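As an illustration only (the device name and block size below are placeholders, not part of this patch), one direct-io pass over the whole crypt device is enough to give every sector a freshly calculated authentication tag:

# hypothetical device name; dd stops with "No space left on device" once the end of the device is reached
dd if=/dev/zero of=/dev/mapper/y bs=1M oflag=direct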
For now, a trivial tool is available to do this, see: https://github.com/mbroz/dm_int_tools Signed-off-by: Milan Broz Signed-off-by: Ondrej Mosnacek Signed-off-by: Vashek Matyas Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-crypt.txt | 16 + drivers/md/dm-crypt.c | 886 +++++++++++++++++++---- 2 files changed, 760 insertions(+), 142 deletions(-) diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index ff1f87bf26e8..a2a6627aa659 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -93,6 +93,22 @@ submit_from_crypt_cpus thread because it benefits CFQ to have writes submitted using the same context. +integrity:<bytes>:<type> + Calculates and verifies integrity for the encrypted device (uses + authenticated encryption). This mode requires metadata stored in a per-bio + integrity structure of <bytes> in size. + + This option requires that the underlying device is created by the dm-integrity + target and provides exactly <bytes> of per-sector metadata. + + There can be two options for <type>. The first one is used when the encryption + mode is an authenticated (AEAD) mode, then type must be just "aead". + The second option is integrity calculated by a keyed hash (HMAC), then + <type> is for example "hmac(sha256)". + + If a random IV is used (persistently stored IV in the per-sector metadata), + then <bytes> includes both the space for the random IV and the authentication tag. + Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 389a3637ffcc..e8e570d000f6 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,8 +1,8 @@ /* * Copyright (C) 2003 Jana Saout * Copyright (C) 2004 Clemens Fruhwirth - * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved. - * Copyright (C) 2013 Milan Broz + * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013-2017 Milan Broz + * * This file is released under the GPL. */ @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include /* for struct rtattr and RTA macros only */ #include #include @@ -48,7 +51,11 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; - struct skcipher_request *req; + union { + struct skcipher_request *req; + struct aead_request *req_aead; + } r; + }; /* @@ -57,6 +64,8 @@ struct convert_context { struct dm_crypt_io { struct crypt_config *cc; struct bio *base_bio; + u8 *integrity_metadata; + bool integrity_metadata_from_pool; struct work_struct work; struct convert_context ctx; @@ -70,8 +79,8 @@ struct dm_crypt_io { struct dm_crypt_request { struct convert_context *ctx; - struct scatterlist sg_in; - struct scatterlist sg_out; + struct scatterlist sg_in[4]; + struct scatterlist sg_out[4]; sector_t iv_sector; }; @@ -118,6 +127,11 @@ struct iv_tcw_private { enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; +enum cipher_flags { + CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ + CRYPT_MODE_INTEGRITY_HMAC, /* Compose authenticated mode from normal mode and HMAC */ +}; + /* * The fields in here must be read only after initialization.
*/ @@ -126,11 +140,14 @@ struct crypt_config { sector_t start; /* - * pool for per bio private data, crypto requests and - * encryption requeusts/buffer pages + * pool for per bio private data, crypto requests, + * encryption requeusts/buffer pages and integrity tags */ mempool_t *req_pool; mempool_t *page_pool; + mempool_t *tag_pool; + unsigned tag_pool_max_sectors; + struct bio_set *bs; struct mutex bio_alloc_lock; @@ -143,6 +160,7 @@ struct crypt_config { char *cipher; char *cipher_string; + char *cipher_auth; char *key_string; const struct crypt_iv_operations *iv_gen_ops; @@ -157,8 +175,12 @@ struct crypt_config { /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; - struct crypto_skcipher **tfms; + union { + struct crypto_skcipher **tfms; + struct crypto_aead **tfms_aead; + } cipher_tfm; unsigned tfms_count; + unsigned long cipher_flags; /* * Layout of each crypto request: @@ -181,21 +203,36 @@ struct crypt_config { unsigned int key_size; unsigned int key_parts; /* independent parts in key buffer */ unsigned int key_extra_size; /* additional keys length */ + unsigned int key_mac_size; /* MAC key size for authenc(...) */ + + unsigned int integrity_tag_size; + unsigned int integrity_iv_size; + unsigned int on_disk_tag_size; + + u8 *authenc_key; /* space for keys in authenc() format (if used) */ u8 key[0]; }; -#define MIN_IOS 64 +#define MIN_IOS 64 +#define MAX_TAG_SIZE 480 +#define POOL_ENTRY_SIZE 512 static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); -static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); +static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, + struct scatterlist *sg); /* * Use this to access cipher attributes that are the same for each CPU. 
*/ static struct crypto_skcipher *any_tfm(struct crypt_config *cc) { - return cc->tfms[0]; + return cc->cipher_tfm.tfms[0]; +} + +static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) +{ + return cc->cipher_tfm.tfms_aead[0]; } /* @@ -325,8 +362,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, return essiv_tfm; } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_skcipher_ivsize(any_tfm(cc))) { + if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); @@ -395,6 +431,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, essiv_tfm = setup_essiv_cpu(cc, ti, salt, crypto_ahash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); @@ -585,12 +622,14 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; u8 *src; int r = 0; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in)); - r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_in); + src = kmap_atomic(sg_page(sg)); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); kunmap_atomic(src); } else memset(iv, 0, cc->iv_size); @@ -601,18 +640,20 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; u8 *dst; int r; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) return 0; - dst = kmap_atomic(sg_page(&dmreq->sg_out)); - r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_out); + dst = kmap_atomic(sg_page(sg)); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); /* Tweak the first block of plaintext sector */ if (!r) - crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_atomic(dst); return r; @@ -724,6 +765,7 @@ out: static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; @@ -731,8 +773,9 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_in); + src = kmap_atomic(sg_page(sg)); + r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_atomic(src); } @@ -748,6 +791,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { + struct scatterlist *sg; u8 *dst; int r; @@ -755,13 +799,22 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, return 0; /* Apply whitening on ciphertext */ - dst = kmap_atomic(sg_page(&dmreq->sg_out)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); + sg = crypt_get_sg_data(cc, dmreq->sg_out); + dst = kmap_atomic(sg_page(sg)); + r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_atomic(dst); return r; } +static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, 
+ struct dm_crypt_request *dmreq) +{ + /* Used only for writes, there must be an additional space to store IV */ + get_random_bytes(iv, cc->iv_size); + return 0; +} + static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -806,6 +859,108 @@ static const struct crypt_iv_operations crypt_iv_tcw_ops = { .post = crypt_iv_tcw_post }; +static struct crypt_iv_operations crypt_iv_random_ops = { + .generator = crypt_iv_random_gen +}; + +/* + * Integrity extensions + */ +static bool crypt_integrity_aead(struct crypt_config *cc) +{ + return test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); +} + +static bool crypt_integrity_hmac(struct crypt_config *cc) +{ + return test_bit(CRYPT_MODE_INTEGRITY_HMAC, &cc->cipher_flags); +} + +static bool crypt_integrity_mode(struct crypt_config *cc) +{ + return crypt_integrity_aead(cc) || crypt_integrity_hmac(cc); +} + +/* Get sg containing data */ +static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, + struct scatterlist *sg) +{ + if (unlikely(crypt_integrity_mode(cc))) + return &sg[2]; + + return sg; +} + +static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) +{ + struct bio_integrity_payload *bip; + unsigned int tag_len; + int ret; + + if (!bio_sectors(bio) || !io->cc->on_disk_tag_size) + return 0; + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + tag_len = io->cc->on_disk_tag_size * bio_sectors(bio); + + bip->bip_iter.bi_size = tag_len; + bip->bip_iter.bi_sector = io->cc->start + io->sector; + + /* We own the metadata, do not let bio_free to release it */ + bip->bip_flags &= ~BIP_BLOCK_INTEGRITY; + + ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), + tag_len, offset_in_page(io->integrity_metadata)); + if (unlikely(ret != tag_len)) + return -ENOMEM; + + return 0; +} + +static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) +{ +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + + /* From now we require underlying device with our integrity profile */ + if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + ti->error = "Integrity profile not supported."; + return -EINVAL; + } + + if (bi->tag_size != cc->on_disk_tag_size) { + ti->error = "Integrity profile tag size mismatch."; + return -EINVAL; + } + + if (crypt_integrity_mode(cc)) { + cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; + DMINFO("Integrity AEAD, tag size %u, IV size %u.", + cc->integrity_tag_size, cc->integrity_iv_size); + + if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { + ti->error = "Integrity AEAD auth tag size is not supported."; + return -EINVAL; + } + } else if (cc->integrity_iv_size) + DMINFO("Additional per-sector space %u bytes for IV.", + cc->integrity_iv_size); + + if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { + ti->error = "Not enough space for integrity tag in the profile."; + return -EINVAL; + } + + return 0; +#else + ti->error = "Integrity profile not supported."; + return -EINVAL; +#endif +} + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -822,58 +977,207 @@ static void crypt_convert_init(struct crypt_config *cc, } static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, - struct skcipher_request *req) + void *req) { return (struct dm_crypt_request *)((char *)req + 
cc->dmreq_start); } -static struct skcipher_request *req_of_dmreq(struct crypt_config *cc, - struct dm_crypt_request *dmreq) +static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start); + return (void *)((char *)dmreq - cc->dmreq_start); } static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - return (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_skcipher_alignmask(any_tfm(cc)) + 1); + if (crypt_integrity_mode(cc)) + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_aead_alignmask(any_tfm_aead(cc)) + 1); + else + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_skcipher_alignmask(any_tfm(cc)) + 1); } -static int crypt_convert_block(struct crypt_config *cc, - struct convert_context *ctx, - struct skcipher_request *req) +static u8 *org_iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return iv_of_dmreq(cc, dmreq) + cc->iv_size; +} + +static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; + return (uint64_t*) ptr; +} + +static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + + cc->iv_size + sizeof(uint64_t); + return (unsigned int*)ptr; +} + +static void *tag_from_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + struct convert_context *ctx = dmreq->ctx; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + + return &io->integrity_metadata[*org_tag_of_dmreq(cc, dmreq) * + cc->on_disk_tag_size]; +} + +static void *iv_tag_from_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return tag_from_dmreq(cc, dmreq) + cc->integrity_tag_size; +} + +static int crypt_convert_block_aead(struct crypt_config *cc, + struct convert_context *ctx, + struct aead_request *req, + unsigned int tag_offset) { struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; - u8 *iv; - int r; + unsigned int data_len = 1 << SECTOR_SHIFT; + u8 *iv, *org_iv, *tag_iv, *tag; + uint64_t *sector; + int r = 0; + + BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); dmreq = dmreq_of_req(cc, req); - iv = iv_of_dmreq(cc, dmreq); - dmreq->iv_sector = ctx->cc_sector; dmreq->ctx = ctx; - sg_init_table(&dmreq->sg_in, 1); - sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT, - bv_in.bv_offset); - sg_init_table(&dmreq->sg_out, 1); - sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, - bv_out.bv_offset); + *org_tag_of_dmreq(cc, dmreq) = tag_offset; - bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); - bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); + sector = org_sector_of_dmreq(cc, dmreq); + *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); + + iv = iv_of_dmreq(cc, dmreq); + org_iv = org_iv_of_dmreq(cc, dmreq); + tag = tag_from_dmreq(cc, dmreq); + tag_iv = iv_tag_from_dmreq(cc, dmreq); + + /* AEAD request: + * |----- AAD -------|------ DATA -------|-- AUTH TAG --| + * | (authenticated) | (auth+encryption) | | + * | sector_LE | IV | sector in/out | tag in/out | + */ + sg_init_table(dmreq->sg_in, 4); + sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t)); + sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size); + 
sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, data_len, bv_in.bv_offset); + sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size); + + sg_init_table(dmreq->sg_out, 4); + sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t)); + sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size); + sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, data_len, bv_out.bv_offset); + sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size); if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, dmreq); - if (r < 0) - return r; + /* For READs use IV stored in integrity metadata */ + if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { + memcpy(org_iv, tag_iv, cc->iv_size); + } else { + r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); + if (r < 0) + return r; + /* Store generated IV in integrity metadata */ + if (cc->integrity_iv_size) + memcpy(tag_iv, org_iv, cc->iv_size); + } + /* Working copy of IV, to be modified in crypto API */ + memcpy(iv, org_iv, cc->iv_size); } - skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, - 1 << SECTOR_SHIFT, iv); + aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size); + if (bio_data_dir(ctx->bio_in) == WRITE) { + aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, + data_len, iv); + r = crypto_aead_encrypt(req); + if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) + memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, + cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); + } else { + aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, + data_len + cc->integrity_tag_size, iv); + r = crypto_aead_decrypt(req); + } + + if (r == -EBADMSG) + DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + (unsigned long long)le64_to_cpu(*sector)); + + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + + bio_advance_iter(ctx->bio_in, &ctx->iter_in, data_len); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, data_len); + + return r; +} + +static int crypt_convert_block_skcipher(struct crypt_config *cc, + struct convert_context *ctx, + struct skcipher_request *req, + unsigned int tag_offset) +{ + struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); + struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); + struct scatterlist *sg_in, *sg_out; + struct dm_crypt_request *dmreq; + unsigned int data_len = 1 << SECTOR_SHIFT; + u8 *iv, *org_iv, *tag_iv; + uint64_t *sector; + int r = 0; + + dmreq = dmreq_of_req(cc, req); + dmreq->iv_sector = ctx->cc_sector; + dmreq->ctx = ctx; + + *org_tag_of_dmreq(cc, dmreq) = tag_offset; + + iv = iv_of_dmreq(cc, dmreq); + org_iv = org_iv_of_dmreq(cc, dmreq); + tag_iv = iv_tag_from_dmreq(cc, dmreq); + + sector = org_sector_of_dmreq(cc, dmreq); + *sector = cpu_to_le64(ctx->cc_sector - cc->iv_offset); + + /* For skcipher we use only the first sg item */ + sg_in = &dmreq->sg_in[0]; + sg_out = &dmreq->sg_out[0]; + + sg_init_table(sg_in, 1); + sg_set_page(sg_in, bv_in.bv_page, data_len, bv_in.bv_offset); + + sg_init_table(sg_out, 1); + sg_set_page(sg_out, bv_out.bv_page, data_len, bv_out.bv_offset); + + if (cc->iv_gen_ops) { + /* For READs use IV stored in integrity metadata */ + if (cc->integrity_iv_size && bio_data_dir(ctx->bio_in) != WRITE) { + memcpy(org_iv, tag_iv, cc->integrity_iv_size); + } else { + r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); + if (r < 0) + return r; + /* Store generated IV in integrity metadata */ + if (cc->integrity_iv_size) + memcpy(tag_iv, org_iv, 
cc->integrity_iv_size); + } + /* Working copy of IV, to be modified in crypto API */ + memcpy(iv, org_iv, cc->iv_size); + } + + skcipher_request_set_crypt(req, sg_in, sg_out, data_len, iv); if (bio_data_dir(ctx->bio_in) == WRITE) r = crypto_skcipher_encrypt(req); @@ -881,7 +1185,10 @@ static int crypt_convert_block(struct crypt_config *cc, r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) - r = cc->iv_gen_ops->post(cc, iv, dmreq); + r = cc->iv_gen_ops->post(cc, org_iv, dmreq); + + bio_advance_iter(ctx->bio_in, &ctx->iter_in, data_len); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, data_len); return r; } @@ -889,27 +1196,53 @@ static int crypt_convert_block(struct crypt_config *cc, static void kcryptd_async_done(struct crypto_async_request *async_req, int error); -static void crypt_alloc_req(struct crypt_config *cc, - struct convert_context *ctx) +static void crypt_alloc_req_skcipher(struct crypt_config *cc, + struct convert_context *ctx) { unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); - if (!ctx->req) - ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); + if (!ctx->r.req) + ctx->r.req = mempool_alloc(cc->req_pool, GFP_NOIO); - skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]); /* * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs * requests if driver request queue is full. */ - skcipher_request_set_callback(ctx->req, + skcipher_request_set_callback(ctx->r.req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, dmreq_of_req(cc, ctx->req)); + kcryptd_async_done, dmreq_of_req(cc, ctx->r.req)); } -static void crypt_free_req(struct crypt_config *cc, - struct skcipher_request *req, struct bio *base_bio) +static void crypt_alloc_req_aead(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (!ctx->r.req_aead) + ctx->r.req_aead = mempool_alloc(cc->req_pool, GFP_NOIO); + + aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]); + + /* + * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs + * requests if driver request queue is full. 
+ */ + aead_request_set_callback(ctx->r.req_aead, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead)); +} + +static void crypt_alloc_req(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (crypt_integrity_mode(cc)) + crypt_alloc_req_aead(cc, ctx); + else + crypt_alloc_req_skcipher(cc, ctx); +} + +static void crypt_free_req_skcipher(struct crypt_config *cc, + struct skcipher_request *req, struct bio *base_bio) { struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); @@ -917,12 +1250,30 @@ static void crypt_free_req(struct crypt_config *cc, mempool_free(req, cc->req_pool); } +static void crypt_free_req_aead(struct crypt_config *cc, + struct aead_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct aead_request *)(io + 1) != req) + mempool_free(req, cc->req_pool); +} + +static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio) +{ + if (crypt_integrity_mode(cc)) + crypt_free_req_aead(cc, req, base_bio); + else + crypt_free_req_skcipher(cc, req, base_bio); +} + /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + unsigned int tag_offset = 0; int r; atomic_set(&ctx->cc_pending, 1); @@ -933,7 +1284,10 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->cc_pending); - r = crypt_convert_block(cc, ctx, ctx->req); + if (crypt_integrity_mode(cc)) + r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset); + else + r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset); switch (r) { /* @@ -949,8 +1303,9 @@ static int crypt_convert(struct crypt_config *cc, * completion function kcryptd_async_done() will be called. */ case -EINPROGRESS: - ctx->req = NULL; + ctx->r.req = NULL; ctx->cc_sector++; + tag_offset++; continue; /* * The request was already processed (synchronously). @@ -958,13 +1313,21 @@ static int crypt_convert(struct crypt_config *cc, case 0: atomic_dec(&ctx->cc_pending); ctx->cc_sector++; + tag_offset++; cond_resched(); continue; - - /* There was an error while processing the request. */ + /* + * There was a data integrity error. + */ + case -EBADMSG: + atomic_dec(&ctx->cc_pending); + return -EILSEQ; + /* + * There was an error while processing the request. 
+ */ default: atomic_dec(&ctx->cc_pending); - return r; + return -EIO; } } @@ -1005,7 +1368,7 @@ retry: clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); if (!clone) - goto return_clone; + goto out; clone_init(io, clone); @@ -1027,7 +1390,13 @@ retry: remaining_size -= len; } -return_clone: + /* Allocate space for integrity tags */ + if (dm_crypt_integrity_io_alloc(io, clone)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + clone = NULL; + } +out: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) mutex_unlock(&cc->bio_alloc_lock); @@ -1053,7 +1422,9 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, io->base_bio = bio; io->sector = sector; io->error = 0; - io->ctx.req = NULL; + io->ctx.r.req = NULL; + io->integrity_metadata = NULL; + io->integrity_metadata_from_pool = false; atomic_set(&io->io_pending, 0); } @@ -1075,8 +1446,13 @@ static void crypt_dec_pending(struct dm_crypt_io *io) if (!atomic_dec_and_test(&io->io_pending)) return; - if (io->ctx.req) - crypt_free_req(cc, io->ctx.req, base_bio); + if (io->ctx.r.req) + crypt_free_req(cc, io->ctx.r.req, base_bio); + + if (unlikely(io->integrity_metadata_from_pool)) + mempool_free(io->integrity_metadata, io->cc->tag_pool); + else + kfree(io->integrity_metadata); base_bio->bi_error = error; bio_endio(base_bio); @@ -1156,6 +1532,12 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) clone_init(io, clone); clone->bi_iter.bi_sector = cc->start + io->sector; + if (dm_crypt_integrity_io_alloc(io, clone)) { + crypt_dec_pending(io); + bio_put(clone); + return 1; + } + generic_make_request(clone); return 0; } @@ -1314,8 +1696,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); - if (r) - io->error = -EIO; + if (r < 0) + io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); /* Encryption was already finished, submit io now */ @@ -1345,7 +1727,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) r = crypt_convert(cc, &io->ctx); if (r < 0) - io->error = -EIO; + io->error = r; if (atomic_dec_and_test(&io->ctx.cc_pending)) kcryptd_crypt_read_done(io); @@ -1372,9 +1754,13 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, } if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) - error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); - if (error < 0) + if (error == -EBADMSG) { + DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); + io->error = -EILSEQ; + } else if (error < 0) io->error = -EIO; crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); @@ -1430,37 +1816,59 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) return 0; } -static void crypt_free_tfms(struct crypt_config *cc) +static void crypt_free_tfms_aead(struct crypt_config *cc) +{ + if (!cc->cipher_tfm.tfms_aead) + return; + + if (cc->cipher_tfm.tfms_aead[0] && !IS_ERR(cc->cipher_tfm.tfms_aead[0])) { + crypto_free_aead(cc->cipher_tfm.tfms_aead[0]); + cc->cipher_tfm.tfms_aead[0] = NULL; + } + + kfree(cc->cipher_tfm.tfms_aead); + cc->cipher_tfm.tfms_aead = NULL; +} + +static void crypt_free_tfms_skcipher(struct crypt_config *cc) { unsigned i; - if (!cc->tfms) + if (!cc->cipher_tfm.tfms) return; for (i = 0; i < cc->tfms_count; i++) - if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { - crypto_free_skcipher(cc->tfms[i]); - cc->tfms[i] = NULL; + if 
(cc->cipher_tfm.tfms[i] && !IS_ERR(cc->cipher_tfm.tfms[i])) { + crypto_free_skcipher(cc->cipher_tfm.tfms[i]); + cc->cipher_tfm.tfms[i] = NULL; } - kfree(cc->tfms); - cc->tfms = NULL; + kfree(cc->cipher_tfm.tfms); + cc->cipher_tfm.tfms = NULL; } -static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) +static void crypt_free_tfms(struct crypt_config *cc) +{ + if (crypt_integrity_mode(cc)) + crypt_free_tfms_aead(cc); + else + crypt_free_tfms_skcipher(cc); +} + +static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) { unsigned i; int err; - cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), - GFP_KERNEL); - if (!cc->tfms) + cc->cipher_tfm.tfms = kzalloc(cc->tfms_count * + sizeof(struct crypto_skcipher *), GFP_KERNEL); + if (!cc->cipher_tfm.tfms) return -ENOMEM; for (i = 0; i < cc->tfms_count; i++) { - cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); - if (IS_ERR(cc->tfms[i])) { - err = PTR_ERR(cc->tfms[i]); + cc->cipher_tfm.tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); + if (IS_ERR(cc->cipher_tfm.tfms[i])) { + err = PTR_ERR(cc->cipher_tfm.tfms[i]); crypt_free_tfms(cc); return err; } @@ -1469,22 +1877,111 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) return 0; } +static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) +{ + char *authenc = NULL; + int err; + + cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL); + if (!cc->cipher_tfm.tfms) + return -ENOMEM; + + /* Compose AEAD cipher with autenc(authenticator,cipher) structure */ + if (crypt_integrity_hmac(cc)) { + authenc = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!authenc) + return -ENOMEM; + err = snprintf(authenc, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", cc->cipher_auth, ciphermode); + if (err < 0) { + kzfree(authenc); + return err; + } + ciphermode = authenc; + } + + cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0); + if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) { + err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]); + crypt_free_tfms(cc); + return err; + } + + kzfree(authenc); + return 0; +} + +static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) +{ + if (crypt_integrity_mode(cc)) + return crypt_alloc_tfms_aead(cc, ciphermode); + else + return crypt_alloc_tfms_skcipher(cc, ciphermode); +} + +static unsigned crypt_subkey_size(struct crypt_config *cc) +{ + return (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); +} + +static unsigned crypt_authenckey_size(struct crypt_config *cc) +{ + return crypt_subkey_size(cc) + RTA_SPACE(sizeof(struct crypto_authenc_key_param)); +} + +/* + * If AEAD is composed like authenc(hmac(sha256),xts(aes)), + * the key must be for some reason in special format. + * This funcion converts cc->key to this special format. 
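+ *
+ * The resulting blob layout is:
+ *   | rtattr header | authenc_key_param (enckeylen, BE32) | auth key | enc key |
+ * i.e. the RTA-encoded key format that the crypto API authenc() template
+ * expects in its setkey.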
+ */ +static void crypt_copy_authenckey(char *p, const void *key, + unsigned enckeylen, unsigned authkeylen) +{ + struct crypto_authenc_key_param *param; + struct rtattr *rta; + + rta = (struct rtattr *)p; + param = RTA_DATA(rta); + param->enckeylen = cpu_to_be32(enckeylen); + rta->rta_len = RTA_LENGTH(sizeof(*param)); + rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; + p += RTA_SPACE(sizeof(*param)); + memcpy(p, key + enckeylen, authkeylen); + p += authkeylen; + memcpy(p, key, enckeylen); +} + static int crypt_setkey(struct crypt_config *cc) { unsigned subkey_size; int err = 0, i, r; /* Ignore extra keys (which are used for IV etc) */ - subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); + subkey_size = crypt_subkey_size(cc); + if (crypt_integrity_hmac(cc)) + crypt_copy_authenckey(cc->authenc_key, cc->key, + subkey_size - cc->key_mac_size, + cc->key_mac_size); for (i = 0; i < cc->tfms_count; i++) { - r = crypto_skcipher_setkey(cc->tfms[i], - cc->key + (i * subkey_size), - subkey_size); + if (crypt_integrity_aead(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->key + (i * subkey_size), + subkey_size); + else if (crypt_integrity_hmac(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->authenc_key, crypt_authenckey_size(cc)); + else + r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i], + cc->key + (i * subkey_size), + subkey_size); if (r) err = r; } + if (crypt_integrity_hmac(cc)) + memzero_explicit(cc->authenc_key, crypt_authenckey_size(cc)); + return err; } @@ -1681,6 +2178,7 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->page_pool); mempool_destroy(cc->req_pool); + mempool_destroy(cc->tag_pool); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); @@ -1691,6 +2189,8 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc->cipher); kzfree(cc->cipher_string); kzfree(cc->key_string); + kzfree(cc->cipher_auth); + kzfree(cc->authenc_key); /* Must zero key material before freeing */ kzfree(cc); @@ -1731,7 +2231,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } cc->key_parts = cc->tfms_count; - cc->key_extra_size = 0; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) @@ -1777,7 +2276,20 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + if (crypt_integrity_mode(cc)) + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + + if (crypt_integrity_hmac(cc)) { + cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); + if (!cc->authenc_key) { + ret = -ENOMEM; + ti->error = "Error allocating authenc key space"; + goto bad; + } + } + if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1816,6 +2328,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_tcw_ops; cc->key_parts += 2; /* IV + whitening */ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; + } else if (strcmp(ivmode, "random") == 0) { + cc->iv_gen_ops = &crypt_iv_random_ops; + /* Need storage space in integrity fields. 
*/ + cc->integrity_iv_size = cc->iv_size; } else { ret = -EINVAL; ti->error = "Invalid IV mode"; @@ -1857,6 +2373,75 @@ bad_mem: return -ENOMEM; } +static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc = ti->private; + struct dm_arg_set as; + static struct dm_arg _args[] = { + {0, 3, "Invalid number of feature args"}, + }; + unsigned int opt_params, val; + const char *opt_string, *sval; + int ret; + + /* Optional parameters */ + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + return ret; + + while (opt_params--) { + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + return -EINVAL; + } + + if (!strcasecmp(opt_string, "allow_discards")) + ti->num_discard_bios = 1; + + else if (!strcasecmp(opt_string, "same_cpu_crypt")) + set_bit(DM_CRYPT_SAME_CPU, &cc->flags); + + else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) + set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { + if (val == 0 || val > MAX_TAG_SIZE) { + ti->error = "Invalid integrity arguments"; + return -EINVAL; + } + cc->on_disk_tag_size = val; + sval = strchr(opt_string + strlen("integrity:"), ':') + 1; + if (!strcasecmp(sval, "aead")) { + set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); + } else if (!strncasecmp(sval, "hmac(", strlen("hmac("))) { + struct crypto_ahash *hmac_tfm = crypto_alloc_ahash(sval, 0, 0); + if (IS_ERR(hmac_tfm)) { + ti->error = "Error initializing HMAC integrity hash."; + return PTR_ERR(hmac_tfm); + } + cc->key_mac_size = crypto_ahash_digestsize(hmac_tfm); + crypto_free_ahash(hmac_tfm); + set_bit(CRYPT_MODE_INTEGRITY_HMAC, &cc->cipher_flags); + } else if (strcasecmp(sval, "none")) { + ti->error = "Unknown integrity profile"; + return -EINVAL; + } + + cc->cipher_auth = kstrdup(sval, GFP_KERNEL); + if (!cc->cipher_auth) + return -ENOMEM; + } else { + ti->error = "Invalid feature arguments"; + return -EINVAL; + } + } + + return 0; +} + /* * Construct an encryption mapping: * [|:::] @@ -1865,18 +2450,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; int key_size; - unsigned int opt_params; + unsigned int align_mask; unsigned long long tmpll; int ret; - size_t iv_size_padding; - struct dm_arg_set as; - const char *opt_string; + size_t iv_size_padding, additional_req_size; char dummy; - static struct dm_arg _args[] = { - {0, 3, "Invalid number of feature args"}, - }; - if (argc < 5) { ti->error = "Not enough arguments"; return -EINVAL; @@ -1896,38 +2475,59 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->key_size = key_size; ti->private = cc; + + /* Optional parameters need to be read before cipher constructor */ + if (argc > 5) { + ret = crypt_ctr_optional(ti, argc - 5, &argv[5]); + if (ret) + goto bad; + } + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); if (ret < 0) goto bad; - cc->dmreq_start = sizeof(struct skcipher_request); - cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); + if (crypt_integrity_mode(cc)) { + cc->dmreq_start = sizeof(struct aead_request); + cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc)); + align_mask = crypto_aead_alignmask(any_tfm_aead(cc)); + } else { + cc->dmreq_start = sizeof(struct skcipher_request); + cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); + align_mask = crypto_skcipher_alignmask(any_tfm(cc)); + } cc->dmreq_start = 
ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); - if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { + if (align_mask < CRYPTO_MINALIGN) { /* Allocate the padding exactly */ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) - & crypto_skcipher_alignmask(any_tfm(cc)); + & align_mask; } else { /* * If the cipher requires greater alignment than kmalloc * alignment, we don't know the exact position of the * initialization vector. We must assume worst case. */ - iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc)); + iv_size_padding = align_mask; } ret = -ENOMEM; - cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + - sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); + + /* ...| IV + padding | original IV | original sec. number | bio tag offset | */ + additional_req_size = sizeof(struct dm_crypt_request) + + iv_size_padding + cc->iv_size + + cc->iv_size + + sizeof(uint64_t) + + sizeof(unsigned int); + + cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + additional_req_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; goto bad; } cc->per_bio_data_size = ti->per_io_data_size = - ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + - sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, + ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); @@ -1964,39 +2564,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; - argv += 5; - argc -= 5; - - /* Optional parameters */ - if (argc) { - as.argc = argc; - as.argv = argv; - - ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (crypt_integrity_mode(cc) || cc->integrity_iv_size) { + ret = crypt_integrity_ctr(cc, ti); if (ret) goto bad; - ret = -EINVAL; - while (opt_params--) { - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - goto bad; - } + cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size; + if (!cc->tag_pool_max_sectors) + cc->tag_pool_max_sectors = 1; - if (!strcasecmp(opt_string, "allow_discards")) - ti->num_discard_bios = 1; - - else if (!strcasecmp(opt_string, "same_cpu_crypt")) - set_bit(DM_CRYPT_SAME_CPU, &cc->flags); - - else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) - set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); - - else { - ti->error = "Invalid feature arguments"; - goto bad; - } + cc->tag_pool = mempool_create_kmalloc_pool(MIN_IOS, + cc->tag_pool_max_sectors * cc->on_disk_tag_size); + if (!cc->tag_pool) { + ti->error = "Cannot allocate integrity tags mempool"; + goto bad; } } @@ -2062,12 +2643,29 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) * Check if bio is too large, split as needed. 
*/ if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && - bio_data_dir(bio) == WRITE) + (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); - io->ctx.req = (struct skcipher_request *)(io + 1); + + if (cc->on_disk_tag_size) { + unsigned tag_len = cc->on_disk_tag_size * bio_sectors(bio); + + if (unlikely(tag_len > KMALLOC_MAX_SIZE) || + unlikely(!(io->integrity_metadata = kmalloc(tag_len, + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { + if (bio_sectors(bio) > cc->tag_pool_max_sectors) + dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); + io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO); + io->integrity_metadata_from_pool = true; + } + } + + if (crypt_integrity_mode(cc)) + io->ctx.r.req_aead = (struct aead_request *)(io + 1); + else + io->ctx.r.req = (struct skcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) @@ -2108,6 +2706,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + if (cc->on_disk_tag_size) + num_feature_args++; if (num_feature_args) { DMEMIT(" %d", num_feature_args); if (ti->num_discard_bios) @@ -2116,6 +2716,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" same_cpu_crypt"); if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) DMEMIT(" submit_from_crypt_cpus"); + if (cc->on_disk_tag_size) + DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); } break; @@ -2216,7 +2818,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 15, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From e889f97a3e35a4e8f48ebc04c27031ca8805aa7e Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 16 Mar 2017 15:39:39 +0100 Subject: [PATCH 07/50] dm crypt: factor IV constructor out to separate function No functional change. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 130 ++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 61 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e8e570d000f6..aa0aca1aea79 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2196,6 +2196,73 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc); } +static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) +{ + struct crypt_config *cc = ti->private; + + if (crypt_integrity_mode(cc)) + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + + if (crypt_integrity_hmac(cc)) { + cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); + if (!cc->authenc_key) { + ti->error = "Error allocating authenc key space"; + return -ENOMEM; + } + } + + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + + /* Choose ivmode, see comments at iv code. 
*/ + if (ivmode == NULL) + cc->iv_gen_ops = NULL; + else if (strcmp(ivmode, "plain") == 0) + cc->iv_gen_ops = &crypt_iv_plain_ops; + else if (strcmp(ivmode, "plain64") == 0) + cc->iv_gen_ops = &crypt_iv_plain64_ops; + else if (strcmp(ivmode, "essiv") == 0) + cc->iv_gen_ops = &crypt_iv_essiv_ops; + else if (strcmp(ivmode, "benbi") == 0) + cc->iv_gen_ops = &crypt_iv_benbi_ops; + else if (strcmp(ivmode, "null") == 0) + cc->iv_gen_ops = &crypt_iv_null_ops; + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* + * Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + * All keys (including IV seed) are always the same size. + */ + if (cc->key_size % cc->key_parts) { + cc->key_parts++; + cc->key_extra_size = cc->key_size / cc->key_parts; + } + } else if (strcmp(ivmode, "tcw") == 0) { + cc->iv_gen_ops = &crypt_iv_tcw_ops; + cc->key_parts += 2; /* IV + whitening */ + cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; + } else if (strcmp(ivmode, "random") == 0) { + cc->iv_gen_ops = &crypt_iv_random_ops; + /* Need storage space in integrity fields. */ + cc->integrity_iv_size = cc->iv_size; + } else { + ti->error = "Invalid IV mode"; + return -EINVAL; + } + + return 0; +} + static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { @@ -2205,7 +2272,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, int ret = -EINVAL; char dummy; - /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { ti->error = "Bad cipher specification"; return -EINVAL; @@ -2276,67 +2342,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - if (crypt_integrity_mode(cc)) - cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); - else - cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); - - if (crypt_integrity_hmac(cc)) { - cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); - if (!cc->authenc_key) { - ret = -ENOMEM; - ti->error = "Error allocating authenc key space"; - goto bad; - } - } - - if (cc->iv_size) - /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(cc->iv_size, - (unsigned int)(sizeof(u64) / sizeof(u8))); - else if (ivmode) { - DMWARN("Selected cipher does not support IVs"); - ivmode = NULL; - } - - /* Choose ivmode, see comments at iv code. */ - if (ivmode == NULL) - cc->iv_gen_ops = NULL; - else if (strcmp(ivmode, "plain") == 0) - cc->iv_gen_ops = &crypt_iv_plain_ops; - else if (strcmp(ivmode, "plain64") == 0) - cc->iv_gen_ops = &crypt_iv_plain64_ops; - else if (strcmp(ivmode, "essiv") == 0) - cc->iv_gen_ops = &crypt_iv_essiv_ops; - else if (strcmp(ivmode, "benbi") == 0) - cc->iv_gen_ops = &crypt_iv_benbi_ops; - else if (strcmp(ivmode, "null") == 0) - cc->iv_gen_ops = &crypt_iv_null_ops; - else if (strcmp(ivmode, "lmk") == 0) { - cc->iv_gen_ops = &crypt_iv_lmk_ops; - /* - * Version 2 and 3 is recognised according - * to length of provided multi-key string. - * If present (version 3), last key is used as IV seed. - * All keys (including IV seed) are always the same size. - */ - if (cc->key_size % cc->key_parts) { - cc->key_parts++; - cc->key_extra_size = cc->key_size / cc->key_parts; - } - } else if (strcmp(ivmode, "tcw") == 0) { - cc->iv_gen_ops = &crypt_iv_tcw_ops; - cc->key_parts += 2; /* IV + whitening */ - cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; - } else if (strcmp(ivmode, "random") == 0) { - cc->iv_gen_ops = &crypt_iv_random_ops; - /* Need storage space in integrity fields. 
*/ - cc->integrity_iv_size = cc->iv_size; - } else { - ret = -EINVAL; - ti->error = "Invalid IV mode"; + ret = crypt_ctr_ivmode(ti, ivmode); + if (ret < 0) goto bad; - } /* Initialize and set key */ ret = crypt_set_key(cc, key); From 33d2f09fcb357fd1861c4959d1d3505492bf91f8 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 16 Mar 2017 15:39:40 +0100 Subject: [PATCH 08/50] dm crypt: introduce new format of cipher with "capi:" prefix For the new authenticated encryption we have to support generic composed modes (combination of encryption algorithm and authenticator) because this is how the kernel crypto API accesses such algorithms. To simplify the interface, we accept an algorithm directly in crypto API format. The new format is recognised by the "capi:" prefix. The dmcrypt internal IV specification is the same as for the old format. The crypto API cipher specifications format is: capi:cipher_api_spec-ivmode[:ivopts] Examples: capi:cbc(aes)-essiv:sha256 (equivalent to old aes-cbc-essiv:sha256) capi:xts(aes)-plain64 (equivalent to old aes-xts-plain64) Examples of authenticated modes: capi:gcm(aes)-random capi:authenc(hmac(sha256),xts(aes))-random capi:rfc7539(chacha20,poly1305)-random Authenticated modes can only be configured using the new cipher format. Note that this format allows user to specify arbitrary combinations that can be insecure. (Policy decision is done in cryptsetup userspace.) Authenticated encryption algorithms can be of two types, either native modes (like GCM) that performs both encryption and authentication internally, or composed modes where user can compose AEAD with separate specification of encryption algorithm and authenticator. For composed mode with HMAC (length-preserving encryption mode like an XTS and HMAC as an authenticator) we have to calculate HMAC digest size (the separate authentication key is the same size as the HMAC digest). Introduce crypt_ctr_auth_cipher() to parse the crypto API string to get HMAC algorithm and retrieve digest size from it. Also, for HMAC composed mode we need to parse the crypto API string to get the cipher mode nested in the specification. For native AEAD mode (like GCM), we can use crypto_tfm_alg_name() API to get the cipher specification. Because the HMAC composed mode is not processed the same as the native AEAD mode, the CRYPT_MODE_INTEGRITY_HMAC flag is no longer needed and "hmac" specification for the table integrity argument is removed. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-crypt.txt | 51 +++-- drivers/md/dm-crypt.c | 275 ++++++++++++++++------- 2 files changed, 225 insertions(+), 101 deletions(-) diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index a2a6627aa659..8140b71f3c54 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -11,14 +11,31 @@ Parameters: \ [<#opt_params> ] - Encryption cipher and an optional IV generation mode. - (In format cipher[:keycount]-chainmode-ivmode[:ivopts]). - Examples: - des - aes-cbc-essiv:sha256 - twofish-ecb + Encryption cipher, encryption mode and Initial Vector (IV) generator. - /proc/crypto contains supported crypto modes + The cipher specifications format is: + cipher[:keycount]-chainmode-ivmode[:ivopts] + Examples: + aes-cbc-essiv:sha256 + aes-xts-plain64 + serpent-xts-plain64 + + Cipher format also supports direct specification with kernel crypt API + format (selected by capi: prefix). 
The IV specification is the same + as for the first format type. + This format is mainly used for specification of authenticated modes. + + The crypto API cipher specifications format is: + capi:cipher_api_spec-ivmode[:ivopts] + Examples: + capi:cbc(aes)-essiv:sha256 + capi:xts(aes)-plain64 + Examples of authenticated modes: + capi:gcm(aes)-random + capi:authenc(hmac(sha256),xts(aes))-random + capi:rfc7539(chacha20,poly1305)-random + + The /proc/crypto contains a list of curently loaded crypto modes. Key used for encryption. It is encoded either as a hexadecimal number @@ -94,20 +111,16 @@ submit_from_crypt_cpus same context. integrity:: - Calculates and verifies integrity for the encrypted device (uses - authenticated encryption). This mode requires metadata stored in per-bio - integrity structure of in size. + The device requires additional metadata per-sector stored + in per-bio integrity structure. This metadata must by provided + by underlying dm-integrity target. - This option requires that the underlying device is created by dm-integrity - target and provides exactly of per-sector metadata. + The can be "none" if metadata is used only for persistent IV. - There can by two options for . The first one is used when encryption - mode is Authenticated mode (AEAD mode), then type must be just "aead". - The second option is integrity calculated by keyed hash (HMAC), then - is for example "hmac(sha256)". - - If random IV is used (persistently stored IV in metadata per-sector), - then includes both space for random IV and authentication tag. + For Authenticated Encryption with Additional Data (AEAD) + the is "aead". An AEAD mode additionally calculates and verifies + integrity for the encrypted device. The additional space is then + used for storing authentication tag (and persistent IV if needed). 
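
   For illustration only (not part of this patch): assuming the underlying
   device $DEV is a dm-integrity target providing exactly 28 bytes of
   per-sector metadata (a 16-byte authentication tag plus a 12-byte stored
   random IV for gcm(aes)), and with $DEV_SIZE and $KEY as placeholders,
   an authenticated table line could look like:

   # dmsetup create crypt_aead --table "0 $DEV_SIZE crypt capi:gcm(aes)-random $KEY 0 $DEV 0 1 integrity:28:aead"
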
Example scripts =============== diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index aa0aca1aea79..b2e48b26fd40 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -129,7 +129,6 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ - CRYPT_MODE_INTEGRITY_HMAC, /* Compose authenticated mode from normal mode and HMAC */ }; /* @@ -873,19 +872,14 @@ static bool crypt_integrity_aead(struct crypt_config *cc) static bool crypt_integrity_hmac(struct crypt_config *cc) { - return test_bit(CRYPT_MODE_INTEGRITY_HMAC, &cc->cipher_flags); -} - -static bool crypt_integrity_mode(struct crypt_config *cc) -{ - return crypt_integrity_aead(cc) || crypt_integrity_hmac(cc); + return crypt_integrity_aead(cc) && cc->key_mac_size; } /* Get sg containing data */ static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, struct scatterlist *sg) { - if (unlikely(crypt_integrity_mode(cc))) + if (unlikely(crypt_integrity_aead(cc))) return &sg[2]; return sg; @@ -936,7 +930,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } - if (crypt_integrity_mode(cc)) { + if (crypt_integrity_aead(cc)) { cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; DMINFO("Integrity AEAD, tag size %u, IV size %u.", cc->integrity_tag_size, cc->integrity_iv_size); @@ -990,7 +984,7 @@ static void *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmre static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) return (u8 *)ALIGN((unsigned long)(dmreq + 1), crypto_aead_alignmask(any_tfm_aead(cc)) + 1); else @@ -1235,7 +1229,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc, static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) crypt_alloc_req_aead(cc, ctx); else crypt_alloc_req_skcipher(cc, ctx); @@ -1261,7 +1255,7 @@ static void crypt_free_req_aead(struct crypt_config *cc, static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_bio) { - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) crypt_free_req_aead(cc, req, base_bio); else crypt_free_req_skcipher(cc, req, base_bio); @@ -1284,7 +1278,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->cc_pending); - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset); else r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset); @@ -1849,7 +1843,7 @@ static void crypt_free_tfms_skcipher(struct crypt_config *cc) static void crypt_free_tfms(struct crypt_config *cc) { - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) crypt_free_tfms_aead(cc); else crypt_free_tfms_skcipher(cc); @@ -1879,27 +1873,12 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) { - char *authenc = NULL; int err; cc->cipher_tfm.tfms = kmalloc(sizeof(struct crypto_aead *), GFP_KERNEL); if (!cc->cipher_tfm.tfms) return -ENOMEM; - /* Compose AEAD cipher with autenc(authenticator,cipher) structure */ - if (crypt_integrity_hmac(cc)) { - authenc = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); - if (!authenc) - return -ENOMEM; - err = snprintf(authenc, CRYPTO_MAX_ALG_NAME, - "authenc(%s,%s)", 
cc->cipher_auth, ciphermode); - if (err < 0) { - kzfree(authenc); - return err; - } - ciphermode = authenc; - } - cc->cipher_tfm.tfms_aead[0] = crypto_alloc_aead(ciphermode, 0, 0); if (IS_ERR(cc->cipher_tfm.tfms_aead[0])) { err = PTR_ERR(cc->cipher_tfm.tfms_aead[0]); @@ -1907,13 +1886,12 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) return err; } - kzfree(authenc); return 0; } static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) { - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) return crypt_alloc_tfms_aead(cc, ciphermode); else return crypt_alloc_tfms_skcipher(cc, ciphermode); @@ -1964,13 +1942,13 @@ static int crypt_setkey(struct crypt_config *cc) subkey_size - cc->key_mac_size, cc->key_mac_size); for (i = 0; i < cc->tfms_count; i++) { - if (crypt_integrity_aead(cc)) - r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], - cc->key + (i * subkey_size), - subkey_size); - else if (crypt_integrity_hmac(cc)) + if (crypt_integrity_hmac(cc)) r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], cc->authenc_key, crypt_authenckey_size(cc)); + else if (crypt_integrity_aead(cc)) + r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], + cc->key + (i * subkey_size), + subkey_size); else r = crypto_skcipher_setkey(cc->cipher_tfm.tfms[i], cc->key + (i * subkey_size), @@ -2200,19 +2178,11 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) { struct crypt_config *cc = ti->private; - if (crypt_integrity_mode(cc)) + if (crypt_integrity_aead(cc)) cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); else cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); - if (crypt_integrity_hmac(cc)) { - cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); - if (!cc->authenc_key) { - ti->error = "Error allocating authenc key space"; - return -ENOMEM; - } - } - if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -2263,24 +2233,155 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) return 0; } -static int crypt_ctr_cipher(struct dm_target *ti, - char *cipher_in, char *key) +/* + * Workaround to parse cipher algorithm from crypto API spec. + * The cc->cipher is currently used only in ESSIV. + * This should be probably done by crypto-api calls (once available...) + */ +static int crypt_ctr_blkdev_cipher(struct crypt_config *cc) +{ + const char *alg_name = NULL; + char *start, *end; + + if (crypt_integrity_aead(cc)) { + alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc))); + if (!alg_name) + return -EINVAL; + if (crypt_integrity_hmac(cc)) { + alg_name = strchr(alg_name, ','); + if (!alg_name) + return -EINVAL; + } + alg_name++; + } else { + alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc))); + if (!alg_name) + return -EINVAL; + } + + start = strchr(alg_name, '('); + end = strchr(alg_name, ')'); + + if (!start && !end) { + cc->cipher = kstrdup(alg_name, GFP_KERNEL); + return cc->cipher ? 0 : -ENOMEM; + } + + if (!start || !end || ++start >= end) + return -EINVAL; + + cc->cipher = kzalloc(end - start + 1, GFP_KERNEL); + if (!cc->cipher) + return -ENOMEM; + + strncpy(cc->cipher, start, end - start); + + return 0; +} + +/* + * Workaround to parse HMAC algorithm from AEAD crypto API spec. + * The HMAC is needed to calculate tag size (HMAC digest size). + * This should be probably done by crypto-api calls (once available...) 
+ */ +static int crypt_ctr_auth_cipher(struct crypt_config *cc, char *cipher_api) +{ + char *start, *end, *mac_alg = NULL; + struct crypto_ahash *mac; + + if (!strstarts(cipher_api, "authenc(")) + return 0; + + start = strchr(cipher_api, '('); + end = strchr(cipher_api, ','); + if (!start || !end || ++start > end) + return -EINVAL; + + mac_alg = kzalloc(end - start + 1, GFP_KERNEL); + if (!mac_alg) + return -ENOMEM; + strncpy(mac_alg, start, end - start); + + mac = crypto_alloc_ahash(mac_alg, 0, 0); + kfree(mac_alg); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + cc->key_mac_size = crypto_ahash_digestsize(mac); + crypto_free_ahash(mac); + + cc->authenc_key = kmalloc(crypt_authenckey_size(cc), GFP_KERNEL); + if (!cc->authenc_key) + return -ENOMEM; + + return 0; +} + +static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key, + char **ivmode, char **ivopts) { struct crypt_config *cc = ti->private; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; + char *tmp, *cipher_api; + int ret = -EINVAL; + + cc->tfms_count = 1; + + /* + * New format (capi: prefix) + * capi:cipher_api_spec-iv:ivopts + */ + tmp = &cipher_in[strlen("capi:")]; + cipher_api = strsep(&tmp, "-"); + *ivmode = strsep(&tmp, ":"); + *ivopts = tmp; + + if (*ivmode && !strcmp(*ivmode, "lmk")) + cc->tfms_count = 64; + + cc->key_parts = cc->tfms_count; + + /* Allocate cipher */ + ret = crypt_alloc_tfms(cc, cipher_api); + if (ret < 0) { + ti->error = "Error allocating crypto tfm"; + return ret; + } + + /* Alloc AEAD, can be used only in new format. */ + if (crypt_integrity_aead(cc)) { + ret = crypt_ctr_auth_cipher(cc, cipher_api); + if (ret < 0) { + ti->error = "Invalid AEAD cipher spec"; + return -ENOMEM; + } + cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); + } else + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); + + ret = crypt_ctr_blkdev_cipher(cc); + if (ret < 0) { + ti->error = "Cannot allocate cipher string"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key, + char **ivmode, char **ivopts) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *keycount; char *cipher_api = NULL; int ret = -EINVAL; char dummy; - if (strchr(cipher_in, '(')) { + if (strchr(cipher_in, '(') || crypt_integrity_aead(cc)) { ti->error = "Bad cipher specification"; return -EINVAL; } - cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); - if (!cc->cipher_string) - goto bad_mem; - /* * Legacy dm-crypt cipher specification * cipher[:keycount]-mode-iv:ivopts @@ -2303,8 +2404,8 @@ static int crypt_ctr_cipher(struct dm_target *ti, goto bad_mem; chainmode = strsep(&tmp, "-"); - ivopts = strsep(&tmp, "-"); - ivmode = strsep(&ivopts, ":"); + *ivopts = strsep(&tmp, "-"); + *ivmode = strsep(&*ivopts, ":"); if (tmp) DMWARN("Ignoring unexpected additional cipher options"); @@ -2313,12 +2414,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, * For compatibility with the original dm-crypt mapping format, if * only the cipher name is supplied, use cbc-plain. 
*/ - if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { + if (!chainmode || (!strcmp(chainmode, "plain") && !*ivmode)) { chainmode = "cbc"; - ivmode = "plain"; + *ivmode = "plain"; } - if (strcmp(chainmode, "ecb") && !ivmode) { + if (strcmp(chainmode, "ecb") && !*ivmode) { ti->error = "IV mechanism required"; return -EINVAL; } @@ -2338,19 +2439,45 @@ static int crypt_ctr_cipher(struct dm_target *ti, ret = crypt_alloc_tfms(cc, cipher_api); if (ret < 0) { ti->error = "Error allocating crypto tfm"; - goto bad; + kfree(cipher_api); + return ret; } + return 0; +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *ivmode = NULL, *ivopts = NULL; + int ret; + + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) { + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; + } + + if (strstarts(cipher_in, "capi:")) + ret = crypt_ctr_cipher_new(ti, cipher_in, key, &ivmode, &ivopts); + else + ret = crypt_ctr_cipher_old(ti, cipher_in, key, &ivmode, &ivopts); + if (ret) + return ret; + /* Initialize IV */ ret = crypt_ctr_ivmode(ti, ivmode); if (ret < 0) - goto bad; + return ret; /* Initialize and set key */ ret = crypt_set_key(cc, key); if (ret < 0) { ti->error = "Error decoding and setting key"; - goto bad; + return ret; } /* Allocate IV */ @@ -2358,7 +2485,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); if (ret < 0) { ti->error = "Error creating IV"; - goto bad; + return ret; } } @@ -2367,18 +2494,11 @@ static int crypt_ctr_cipher(struct dm_target *ti, ret = cc->iv_gen_ops->init(cc); if (ret < 0) { ti->error = "Error initialising IV"; - goto bad; + return ret; } } - ret = 0; -bad: - kfree(cipher_api); return ret; - -bad_mem: - ti->error = "Cannot allocate cipher strings"; - return -ENOMEM; } static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **argv) @@ -2424,15 +2544,6 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar sval = strchr(opt_string + strlen("integrity:"), ':') + 1; if (!strcasecmp(sval, "aead")) { set_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags); - } else if (!strncasecmp(sval, "hmac(", strlen("hmac("))) { - struct crypto_ahash *hmac_tfm = crypto_alloc_ahash(sval, 0, 0); - if (IS_ERR(hmac_tfm)) { - ti->error = "Error initializing HMAC integrity hash."; - return PTR_ERR(hmac_tfm); - } - cc->key_mac_size = crypto_ahash_digestsize(hmac_tfm); - crypto_free_ahash(hmac_tfm); - set_bit(CRYPT_MODE_INTEGRITY_HMAC, &cc->cipher_flags); } else if (strcasecmp(sval, "none")) { ti->error = "Unknown integrity profile"; return -EINVAL; @@ -2495,7 +2606,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret < 0) goto bad; - if (crypt_integrity_mode(cc)) { + if (crypt_integrity_aead(cc)) { cc->dmreq_start = sizeof(struct aead_request); cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc)); align_mask = crypto_aead_alignmask(any_tfm_aead(cc)); @@ -2572,7 +2683,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; - if (crypt_integrity_mode(cc) || cc->integrity_iv_size) { + if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { ret = crypt_integrity_ctr(cc, ti); if (ret) goto bad; @@ -2670,7 +2781,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) } } - if (crypt_integrity_mode(cc)) + if 
(crypt_integrity_aead(cc)) io->ctx.r.req_aead = (struct aead_request *)(io + 1); else io->ctx.r.req = (struct skcipher_request *)(io + 1); From 8f0009a225171cc1b76a6b443de5137b26e1374b Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 16 Mar 2017 15:39:44 +0100 Subject: [PATCH 09/50] dm crypt: optionally support larger encryption sector size Add optional "sector_size" parameter that specifies encryption sector size (atomic unit of block device encryption). Parameter can be in range 512 - 4096 bytes and must be power of two. For compatibility reasons, the maximal IO must fit into the page limit, so the limit is set to the minimal page size possible (4096 bytes). NOTE: this device cannot yet be handled by cryptsetup if this parameter is set. IV for the sector is calculated from the 512 bytes sector offset unless the iv_large_sectors option is used. Test script using dmsetup: DEV="/dev/sdb" DEV_SIZE=$(blockdev --getsz $DEV) KEY="9c1185a5c5e9fc54612808977ee8f548b2258d31ddadef707ba62c166051b9e3cd0294c27515f2bccee924e8823ca6e124b8fc3167ed478bca702babe4e130ac" BLOCK_SIZE=4096 # dmsetup create test_crypt --table "0 $DEV_SIZE crypt aes-xts-plain64 $KEY 0 $DEV 0 1 sector_size:$BLOCK_SIZE" # dmsetup table --showkeys test_crypt Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-crypt.txt | 14 +++ drivers/md/dm-crypt.c | 105 ++++++++++++++++++----- 2 files changed, 96 insertions(+), 23 deletions(-) diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 8140b71f3c54..3b3e1de21c9c 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -122,6 +122,20 @@ integrity:: integrity for the encrypted device. The additional space is then used for storing authentication tag (and persistent IV if needed). +sector_size: + Use as the encryption unit instead of 512 bytes sectors. + This option can be in range 512 - 4096 bytes and must be power of two. + Virtual device will announce this size as a minimal IO and logical sector. + +iv_large_sectors + IV generators will use sector number counted in units + instead of default 512 bytes sectors. + + For example, if is 4096 bytes, plain64 IV for the second + sector will be 8 (without flag) and 1 if iv_large_sectors is present. + The must be multiple of (in 512 bytes units) + if this flag is specified. 
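
[Editor's illustration, not part of the patch] The IV arithmetic described above can be sanity-checked with a small standalone sketch; this is plain userspace C, not dm-crypt code, and the helper name and hard-coded values are only illustrative.

/*
 * Illustration only: how the IV sector number is derived when
 * sector_size is larger than 512 bytes.
 */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9				/* 512-byte sectors */

static uint64_t iv_sector(uint64_t sector512, unsigned int sector_size,
			  int iv_large_sectors)
{
	if (!iv_large_sectors)
		return sector512;		/* default: keep counting 512-byte sectors */
	return sector512 / (sector_size >> SECTOR_SHIFT);
}

int main(void)
{
	/* The second 4096-byte encryption sector begins at 512-byte sector 8. */
	printf("plain64 IV sector without flag: %llu\n",
	       (unsigned long long)iv_sector(8, 4096, 0));	/* prints 8 */
	printf("plain64 IV sector with iv_large_sectors: %llu\n",
	       (unsigned long long)iv_sector(8, 4096, 1));	/* prints 1 */
	return 0;
}
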
+ Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b2e48b26fd40..05acc42bdd38 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -129,6 +129,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ + CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ }; /* @@ -171,6 +172,7 @@ struct crypt_config { } iv_gen_private; sector_t iv_offset; unsigned int iv_size; + unsigned int sector_size; /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; @@ -524,6 +526,11 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + ti->error = "Unsupported sector size for LMK"; + return -EINVAL; + } + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(lmk->hash_tfm)) { ti->error = "Error initializing LMK hash"; @@ -677,6 +684,11 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + ti->error = "Unsupported sector size for TCW"; + return -EINVAL; + } + if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { ti->error = "Wrong key size for TCW"; return -EINVAL; @@ -1037,15 +1049,20 @@ static int crypt_convert_block_aead(struct crypt_config *cc, struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; - unsigned int data_len = 1 << SECTOR_SHIFT; u8 *iv, *org_iv, *tag_iv, *tag; uint64_t *sector; int r = 0; BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); + /* Reject unexpected unaligned bio. 
*/ + if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + return -EIO; + dmreq = dmreq_of_req(cc, req); dmreq->iv_sector = ctx->cc_sector; + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + sector_div(dmreq->iv_sector, cc->sector_size >> SECTOR_SHIFT); dmreq->ctx = ctx; *org_tag_of_dmreq(cc, dmreq) = tag_offset; @@ -1066,13 +1083,13 @@ static int crypt_convert_block_aead(struct crypt_config *cc, sg_init_table(dmreq->sg_in, 4); sg_set_buf(&dmreq->sg_in[0], sector, sizeof(uint64_t)); sg_set_buf(&dmreq->sg_in[1], org_iv, cc->iv_size); - sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, data_len, bv_in.bv_offset); + sg_set_page(&dmreq->sg_in[2], bv_in.bv_page, cc->sector_size, bv_in.bv_offset); sg_set_buf(&dmreq->sg_in[3], tag, cc->integrity_tag_size); sg_init_table(dmreq->sg_out, 4); sg_set_buf(&dmreq->sg_out[0], sector, sizeof(uint64_t)); sg_set_buf(&dmreq->sg_out[1], org_iv, cc->iv_size); - sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, data_len, bv_out.bv_offset); + sg_set_page(&dmreq->sg_out[2], bv_out.bv_page, cc->sector_size, bv_out.bv_offset); sg_set_buf(&dmreq->sg_out[3], tag, cc->integrity_tag_size); if (cc->iv_gen_ops) { @@ -1094,14 +1111,14 @@ static int crypt_convert_block_aead(struct crypt_config *cc, aead_request_set_ad(req, sizeof(uint64_t) + cc->iv_size); if (bio_data_dir(ctx->bio_in) == WRITE) { aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, - data_len, iv); + cc->sector_size, iv); r = crypto_aead_encrypt(req); if (cc->integrity_tag_size + cc->integrity_iv_size != cc->on_disk_tag_size) memset(tag + cc->integrity_tag_size + cc->integrity_iv_size, 0, cc->on_disk_tag_size - (cc->integrity_tag_size + cc->integrity_iv_size)); } else { aead_request_set_crypt(req, dmreq->sg_in, dmreq->sg_out, - data_len + cc->integrity_tag_size, iv); + cc->sector_size + cc->integrity_tag_size, iv); r = crypto_aead_decrypt(req); } @@ -1112,8 +1129,8 @@ static int crypt_convert_block_aead(struct crypt_config *cc, if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, org_iv, dmreq); - bio_advance_iter(ctx->bio_in, &ctx->iter_in, data_len); - bio_advance_iter(ctx->bio_out, &ctx->iter_out, data_len); + bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); return r; } @@ -1127,13 +1144,18 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct scatterlist *sg_in, *sg_out; struct dm_crypt_request *dmreq; - unsigned int data_len = 1 << SECTOR_SHIFT; u8 *iv, *org_iv, *tag_iv; uint64_t *sector; int r = 0; + /* Reject unexpected unaligned bio. 
*/ + if (unlikely(bv_in.bv_offset & (cc->sector_size - 1))) + return -EIO; + dmreq = dmreq_of_req(cc, req); dmreq->iv_sector = ctx->cc_sector; + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + sector_div(dmreq->iv_sector, cc->sector_size >> SECTOR_SHIFT); dmreq->ctx = ctx; *org_tag_of_dmreq(cc, dmreq) = tag_offset; @@ -1150,10 +1172,10 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, sg_out = &dmreq->sg_out[0]; sg_init_table(sg_in, 1); - sg_set_page(sg_in, bv_in.bv_page, data_len, bv_in.bv_offset); + sg_set_page(sg_in, bv_in.bv_page, cc->sector_size, bv_in.bv_offset); sg_init_table(sg_out, 1); - sg_set_page(sg_out, bv_out.bv_page, data_len, bv_out.bv_offset); + sg_set_page(sg_out, bv_out.bv_page, cc->sector_size, bv_out.bv_offset); if (cc->iv_gen_ops) { /* For READs use IV stored in integrity metadata */ @@ -1171,7 +1193,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, memcpy(iv, org_iv, cc->iv_size); } - skcipher_request_set_crypt(req, sg_in, sg_out, data_len, iv); + skcipher_request_set_crypt(req, sg_in, sg_out, cc->sector_size, iv); if (bio_data_dir(ctx->bio_in) == WRITE) r = crypto_skcipher_encrypt(req); @@ -1181,8 +1203,8 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, org_iv, dmreq); - bio_advance_iter(ctx->bio_in, &ctx->iter_in, data_len); - bio_advance_iter(ctx->bio_out, &ctx->iter_out, data_len); + bio_advance_iter(ctx->bio_in, &ctx->iter_in, cc->sector_size); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, cc->sector_size); return r; } @@ -1268,6 +1290,7 @@ static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { unsigned int tag_offset = 0; + unsigned int sector_step = cc->sector_size / (1 << SECTOR_SHIFT); int r; atomic_set(&ctx->cc_pending, 1); @@ -1275,7 +1298,6 @@ static int crypt_convert(struct crypt_config *cc, while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { crypt_alloc_req(cc, ctx); - atomic_inc(&ctx->cc_pending); if (crypt_integrity_aead(cc)) @@ -1298,16 +1320,16 @@ static int crypt_convert(struct crypt_config *cc, */ case -EINPROGRESS: ctx->r.req = NULL; - ctx->cc_sector++; - tag_offset++; + ctx->cc_sector += sector_step; + tag_offset += sector_step; continue; /* * The request was already processed (synchronously). 
*/ case 0: atomic_dec(&ctx->cc_pending); - ctx->cc_sector++; - tag_offset++; + ctx->cc_sector += sector_step; + tag_offset += sector_step; cond_resched(); continue; /* @@ -2506,10 +2528,11 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar struct crypt_config *cc = ti->private; struct dm_arg_set as; static struct dm_arg _args[] = { - {0, 3, "Invalid number of feature args"}, + {0, 6, "Invalid number of feature args"}, }; unsigned int opt_params, val; const char *opt_string, *sval; + char dummy; int ret; /* Optional parameters */ @@ -2552,7 +2575,16 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar cc->cipher_auth = kstrdup(sval, GFP_KERNEL); if (!cc->cipher_auth) return -ENOMEM; - } else { + } else if (sscanf(opt_string, "sector_size:%u%c", &cc->sector_size, &dummy) == 1) { + if (cc->sector_size < (1 << SECTOR_SHIFT) || + cc->sector_size > 4096 || + (1 << ilog2(cc->sector_size) != cc->sector_size)) { + ti->error = "Invalid feature value for sector_size"; + return -EINVAL; + } + } else if (!strcasecmp(opt_string, "iv_large_sectors")) + set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); + else { ti->error = "Invalid feature arguments"; return -EINVAL; } @@ -2592,6 +2624,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } cc->key_size = key_size; + cc->sector_size = (1 << SECTOR_SHIFT); ti->private = cc; @@ -2664,7 +2697,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) mutex_init(&cc->bio_alloc_lock); ret = -EINVAL; - if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { + if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) || + (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) { ti->error = "Invalid iv_offset sector"; goto bad; } @@ -2765,6 +2799,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); + /* + * Ensure that bio is a multiple of internal sector encryption size + * and is aligned to this size as defined in IO hints. + */ + if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0)) + return -EIO; + + if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1))) + return -EIO; + io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); @@ -2772,12 +2816,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) unsigned tag_len = cc->on_disk_tag_size * bio_sectors(bio); if (unlikely(tag_len > KMALLOC_MAX_SIZE) || - unlikely(!(io->integrity_metadata = kmalloc(tag_len, + unlikely(!(io->integrity_metadata = kzalloc(tag_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { if (bio_sectors(bio) > cc->tag_pool_max_sectors) dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO); io->integrity_metadata_from_pool = true; + memset(io->integrity_metadata, 0, cc->tag_pool_max_sectors * (1 << SECTOR_SHIFT)); } } @@ -2825,6 +2870,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + num_feature_args += (cc->sector_size != (1 << SECTOR_SHIFT)) ? 
1 : 0; + num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); if (cc->on_disk_tag_size) num_feature_args++; if (num_feature_args) { @@ -2837,6 +2884,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" submit_from_crypt_cpus"); if (cc->on_disk_tag_size) DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); + if (cc->sector_size != (1 << SECTOR_SHIFT)) + DMEMIT(" sector_size:%d", cc->sector_size); + if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) + DMEMIT(" iv_large_sectors"); } break; @@ -2926,6 +2977,8 @@ static int crypt_iterate_devices(struct dm_target *ti, static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) { + struct crypt_config *cc = ti->private; + /* * Unfortunate constraint that is required to avoid the potential * for exceeding underlying device's max_segments limits -- due to @@ -2933,11 +2986,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) * bio that are not as physically contiguous as the original bio. */ limits->max_segment_size = PAGE_SIZE; + + if (cc->sector_size != (1 << SECTOR_SHIFT)) { + limits->logical_block_size = cc->sector_size; + limits->physical_block_size = cc->sector_size; + blk_limits_io_min(limits, cc->sector_size); + } } static struct target_type crypt_target = { .name = "crypt", - .version = {1, 16, 0}, + .version = {1, 17, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 1aa0efd4210df1c57764b77040a6615bc9b3ac0f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 17 Mar 2017 14:56:17 -0400 Subject: [PATCH 10/50] dm integrity: factor out create_journal() from dm_integrity_ctr() Preparation for next commit that makes call to create_journal() optional. Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 379 ++++++++++++++++++++------------------ 1 file changed, 196 insertions(+), 183 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ea779cca8b45..ecb0b592f5a0 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2429,6 +2429,200 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, return 0; } +static int create_journal(struct dm_integrity_c *ic, char **error) +{ + int r = 0; + unsigned i; + __u64 journal_pages, journal_desc_size, journal_tree_size; + + journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, + PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); + journal_desc_size = journal_pages * sizeof(struct page_list); + if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { + *error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_pages = journal_pages; + + ic->journal = dm_integrity_alloc_page_list(ic); + if (!ic->journal) { + *error = "Could not allocate memory for journal"; + r = -ENOMEM; + goto bad; + } + if (ic->journal_crypt_alg.alg_string) { + unsigned ivsize, blocksize; + struct journal_completion comp; + + comp.ic = ic; + ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); + if (IS_ERR(ic->journal_crypt)) { + *error = "Invalid journal cipher"; + r = PTR_ERR(ic->journal_crypt); + ic->journal_crypt = NULL; + goto bad; + } + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + blocksize = crypto_skcipher_blocksize(ic->journal_crypt); + + if (ic->journal_crypt_alg.key) { + r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, + 
ic->journal_crypt_alg.key_size); + if (r) { + *error = "Error setting encryption key"; + goto bad; + } + } + DEBUG_print("cipher %s, block size %u iv size %u\n", + ic->journal_crypt_alg.alg_string, blocksize, ivsize); + + ic->journal_io = dm_integrity_alloc_page_list(ic); + if (!ic->journal_io) { + *error = "Could not allocate memory for journal io"; + r = -ENOMEM; + goto bad; + } + + if (blocksize == 1) { + struct scatterlist *sg; + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_xor = dm_integrity_alloc_page_list(ic); + if (!ic->journal_xor) { + *error = "Could not allocate memory for journal xor"; + r = -ENOMEM; + goto bad; + } + + sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0); + if (!sg) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + sg_init_table(sg, ic->journal_pages + 1); + for (i = 0; i < ic->journal_pages; i++) { + char *va = lowmem_page_address(ic->journal_xor[i].page); + clear_page(va); + sg_set_buf(&sg[i], va, PAGE_SIZE); + } + sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); + memset(iv, 0x00, ivsize); + + skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + kvfree(sg); + r = dm_integrity_failed(ic); + if (r) { + *error = "Unable to encrypt journal"; + goto bad; + } + DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data"); + + crypto_free_skcipher(ic->journal_crypt); + ic->journal_crypt = NULL; + } else { + SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); + unsigned char iv[ivsize]; + unsigned crypt_len = roundup(ivsize, blocksize); + unsigned char crypt_data[crypt_len]; + + skcipher_request_set_tfm(req, ic->journal_crypt); + + ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); + if (!ic->journal_scatterlist) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); + if (!ic->journal_io_scatterlist) { + *error = "Unable to allocate sg list"; + r = -ENOMEM; + goto bad; + } + ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO); + if (!ic->sk_requests) { + *error = "Unable to allocate sk requests"; + r = -ENOMEM; + goto bad; + } + for (i = 0; i < ic->journal_sections; i++) { + struct scatterlist sg; + struct skcipher_request *section_req; + __u32 section_le = cpu_to_le32(i); + + memset(iv, 0x00, ivsize); + memset(crypt_data, 0x00, crypt_len); + memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); + + sg_init_one(&sg, crypt_data, crypt_len); + skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); + comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); + comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (do_crypt(true, req, &comp)) + wait_for_completion(&comp.comp); + + r = dm_integrity_failed(ic); + if (r) { + *error = "Unable to generate iv"; + goto bad; + } + + section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!section_req) { + *error = "Unable to allocate crypt request"; + r = -ENOMEM; + goto bad; + } + section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL); + if (!section_req->iv) { + skcipher_request_free(section_req); + *error = 
"Unable to allocate iv"; + r = -ENOMEM; + goto bad; + } + memcpy(section_req->iv + ivsize, crypt_data, ivsize); + section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; + ic->sk_requests[i] = section_req; + DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); + } + } + } + + for (i = 0; i < N_COMMIT_IDS; i++) { + unsigned j; +retest_commit_id: + for (j = 0; j < i; j++) { + if (ic->commit_ids[j] == ic->commit_ids[i]) { + ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); + goto retest_commit_id; + } + } + DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); + } + + journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); + if (journal_tree_size > ULONG_MAX) { + *error = "Journal doesn't fit into memory"; + r = -ENOMEM; + goto bad; + } + ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0); + if (!ic->journal_tree) { + *error = "Could not allocate memory for journal tree"; + r = -ENOMEM; + } +bad: + return r; +} + /* * Construct a integrity mapping: * @@ -2461,7 +2655,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; - __u64 journal_pages, journal_desc_size, journal_tree_size; __u64 threshold; unsigned long long start; @@ -2761,189 +2954,9 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); - journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, - PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); - journal_desc_size = journal_pages * sizeof(struct page_list); - if (journal_pages >= totalram_pages - totalhigh_pages || journal_desc_size > ULONG_MAX) { - ti->error = "Journal doesn't fit into memory"; - r = -ENOMEM; + r = create_journal(ic, &ti->error); + if (r) goto bad; - } - ic->journal_pages = journal_pages; - - ic->journal = dm_integrity_alloc_page_list(ic); - if (!ic->journal) { - ti->error = "Could not allocate memory for journal"; - r = -ENOMEM; - goto bad; - } - if (ic->journal_crypt_alg.alg_string) { - unsigned ivsize, blocksize; - struct journal_completion comp; - comp.ic = ic; - - ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, 0); - if (IS_ERR(ic->journal_crypt)) { - ti->error = "Invalid journal cipher"; - r = PTR_ERR(ic->journal_crypt); - ic->journal_crypt = NULL; - goto bad; - } - ivsize = crypto_skcipher_ivsize(ic->journal_crypt); - blocksize = crypto_skcipher_blocksize(ic->journal_crypt); - - if (ic->journal_crypt_alg.key) { - r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key, - ic->journal_crypt_alg.key_size); - if (r) { - ti->error = "Error setting encryption key"; - goto bad; - } - } - DEBUG_print("cipher %s, block size %u iv size %u\n", - ic->journal_crypt_alg.alg_string, blocksize, ivsize); - - ic->journal_io = dm_integrity_alloc_page_list(ic); - if (!ic->journal_io) { - ti->error = "Could not allocate memory for journal io"; - r = -ENOMEM; - goto bad; - } - - if (blocksize == 1) { - struct scatterlist *sg; - SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); - unsigned char iv[ivsize]; - skcipher_request_set_tfm(req, ic->journal_crypt); - - ic->journal_xor = dm_integrity_alloc_page_list(ic); - if (!ic->journal_xor) { - ti->error = "Could not allocate memory for journal xor"; - r = -ENOMEM; - goto bad; - } - - sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * 
sizeof(struct scatterlist), 0); - if (!sg) { - ti->error = "Unable to allocate sg list"; - r = -ENOMEM; - goto bad; - } - sg_init_table(sg, ic->journal_pages + 1); - for (i = 0; i < ic->journal_pages; i++) { - char *va = lowmem_page_address(ic->journal_xor[i].page); - clear_page(va); - sg_set_buf(&sg[i], va, PAGE_SIZE); - } - sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); - memset(iv, 0x00, ivsize); - - skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); - comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); - comp.in_flight = (atomic_t)ATOMIC_INIT(1); - if (do_crypt(true, req, &comp)) - wait_for_completion(&comp.comp); - kvfree(sg); - if ((r = dm_integrity_failed(ic))) { - ti->error = "Unable to encrypt journal"; - goto bad; - } - DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data"); - - crypto_free_skcipher(ic->journal_crypt); - ic->journal_crypt = NULL; - } else { - SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); - unsigned char iv[ivsize]; - unsigned crypt_len = roundup(ivsize, blocksize); - unsigned char crypt_data[crypt_len]; - - skcipher_request_set_tfm(req, ic->journal_crypt); - - ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); - if (!ic->journal_scatterlist) { - ti->error = "Unable to allocate sg list"; - r = -ENOMEM; - goto bad; - } - ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io); - if (!ic->journal_io_scatterlist) { - ti->error = "Unable to allocate sg list"; - r = -ENOMEM; - goto bad; - } - ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO); - if (!ic->sk_requests) { - ti->error = "Unable to allocate sk requests"; - r = -ENOMEM; - goto bad; - } - for (i = 0; i < ic->journal_sections; i++) { - struct scatterlist sg; - struct skcipher_request *section_req; - __u32 section_le = cpu_to_le32(i); - - memset(iv, 0x00, ivsize); - memset(crypt_data, 0x00, crypt_len); - memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); - - sg_init_one(&sg, crypt_data, crypt_len); - skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); - comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp); - comp.in_flight = (atomic_t)ATOMIC_INIT(1); - if (do_crypt(true, req, &comp)) - wait_for_completion(&comp.comp); - - if ((r = dm_integrity_failed(ic))) { - ti->error = "Unable to generate iv"; - goto bad; - } - - section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); - if (!section_req) { - ti->error = "Unable to allocate crypt request"; - r = -ENOMEM; - goto bad; - } - section_req->iv = kmalloc(ivsize * 2, GFP_KERNEL); - if (!section_req->iv) { - skcipher_request_free(section_req); - ti->error = "Unable to allocate iv"; - r = -ENOMEM; - goto bad; - } - memcpy(section_req->iv + ivsize, crypt_data, ivsize); - section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT; - ic->sk_requests[i] = section_req; - DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i); - } - } - } - - for (i = 0; i < N_COMMIT_IDS; i++) { - unsigned j; -retest_commit_id: - for (j = 0; j < i; j++) { - if (ic->commit_ids[j] == ic->commit_ids[i]) { - ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1); - goto retest_commit_id; - } - } - DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]); - } - - journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node); - if (journal_tree_size > ULONG_MAX) { - ti->error = "Journal doesn't fit into 
memory"; - r = -ENOMEM; - goto bad; - } - ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0); - if (!ic->journal_tree) { - ti->error = "Could not allocate memory for journal tree"; - r = -ENOMEM; - goto bad; - } if (should_write_sb) { int r; From c2bcb2b702e4684e566d295d687228498184e0c4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Mar 2017 12:40:51 -0400 Subject: [PATCH 11/50] dm integrity: add recovery mode In recovery mode, we don't: - replay the journal - check checksums - allow writes to the device This mode can be used as a last resort for data recovery. The motivation for recovery mode is that when there is a single error in the journal, the user should not lose access to the whole device. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 5 +++ drivers/md/dm-integrity.c | 40 +++++++++++++------- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt index 2406f56501dc..9d9089f74206 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.txt @@ -59,6 +59,11 @@ Target arguments: either both data and tag or none of them are written. The journaled mode degrades write throughput twice because the data have to be written twice. + R - recovery mode - in this mode, journal is not replayed, + checksums are not checked and writes to the device are not + allowed. This mode is useful for data recovery if the + device cannot be activated in any of the other standard + modes. 5. the number of additional arguments diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ecb0b592f5a0..e26a079b41ea 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1216,6 +1216,9 @@ static void integrity_metadata(struct work_struct *w) unsigned sectors_to_process = dio->range.n_sectors; sector_t sector = dio->range.logical_sector; + if (unlikely(ic->mode == 'R')) + goto skip_io; + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT) * ic->tag_size + extra_space, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); if (!checksums) @@ -1288,6 +1291,7 @@ again: } } } +skip_io: dec_in_flight(dio); return; error: @@ -1327,6 +1331,9 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) return -EIO; } + if (unlikely(ic->mode == 'R') && unlikely(dio->write)) + return -EIO; + get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); bio->bi_iter.bi_sector = get_data_sector(ic, area, offset); @@ -1926,6 +1933,9 @@ static void replay_journal(struct dm_integrity_c *ic) bool journal_empty; unsigned char unused, last_used, want_commit_seq; + if (ic->mode == 'R') + return; + if (ic->journal_uptodate) return; @@ -2705,7 +2715,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D")) + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) ic->mode = argv[3][0]; else { ti->error = "Invalid mode (expecting J or D)"; @@ -2864,14 +2874,15 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Error reading superblock"; goto bad; } - if (!memcmp(ic->sb->magic, SB_MAGIC, 8)) { - should_write_sb = false; - } else { - for (i = 0; i < 512; i += 8) { - if (*(__u64 *)((__u8 *)ic->sb + i)) { - r = -EINVAL; - ti->error = "The 
device is not initialized"; - goto bad; + should_write_sb = false; + if (memcmp(ic->sb->magic, SB_MAGIC, 8)) { + if (ic->mode != 'R') { + for (i = 0; i < 512; i += 8) { + if (*(__u64 *)((__u8 *)ic->sb + i)) { + r = -EINVAL; + ti->error = "The device is not initialized"; + goto bad; + } } } @@ -2880,7 +2891,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Could not initialize superblock"; goto bad; } - should_write_sb = true; + if (ic->mode != 'R') + should_write_sb = true; } if (ic->sb->version != SB_VERSION) { @@ -2954,9 +2966,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors); - r = create_journal(ic, &ti->error); - if (r) - goto bad; + if (ic->mode != 'R') { + r = create_journal(ic, &ti->error); + if (r) + goto bad; + } if (should_write_sb) { int r; From ff3af92b4461be773337111daea80bb91b2cd545 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 23 Mar 2017 10:23:14 -0400 Subject: [PATCH 12/50] dm crypt: use shifts instead of sector_div sector_div is very slow, so we introduce a variable sector_shift and use shift instead of sector_div. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 05acc42bdd38..ccbc7f36bb2e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -172,7 +172,8 @@ struct crypt_config { } iv_gen_private; sector_t iv_offset; unsigned int iv_size; - unsigned int sector_size; + unsigned short int sector_size; + unsigned char sector_shift; /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; @@ -1062,7 +1063,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, dmreq = dmreq_of_req(cc, req); dmreq->iv_sector = ctx->cc_sector; if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) - sector_div(dmreq->iv_sector, cc->sector_size >> SECTOR_SHIFT); + dmreq->iv_sector >>= cc->sector_shift; dmreq->ctx = ctx; *org_tag_of_dmreq(cc, dmreq) = tag_offset; @@ -1155,7 +1156,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, dmreq = dmreq_of_req(cc, req); dmreq->iv_sector = ctx->cc_sector; if (test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags)) - sector_div(dmreq->iv_sector, cc->sector_size >> SECTOR_SHIFT); + dmreq->iv_sector >>= cc->sector_shift; dmreq->ctx = ctx; *org_tag_of_dmreq(cc, dmreq) = tag_offset; @@ -1290,7 +1291,7 @@ static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { unsigned int tag_offset = 0; - unsigned int sector_step = cc->sector_size / (1 << SECTOR_SHIFT); + unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; int r; atomic_set(&ctx->cc_pending, 1); @@ -2575,13 +2576,14 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar cc->cipher_auth = kstrdup(sval, GFP_KERNEL); if (!cc->cipher_auth) return -ENOMEM; - } else if (sscanf(opt_string, "sector_size:%u%c", &cc->sector_size, &dummy) == 1) { + } else if (sscanf(opt_string, "sector_size:%hu%c", &cc->sector_size, &dummy) == 1) { if (cc->sector_size < (1 << SECTOR_SHIFT) || cc->sector_size > 4096 || - (1 << ilog2(cc->sector_size) != cc->sector_size)) { + (cc->sector_size & (cc->sector_size - 1))) { ti->error = "Invalid feature value for sector_size"; return -EINVAL; } + cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; } else if (!strcasecmp(opt_string, 
"iv_large_sectors")) set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); else { @@ -2625,6 +2627,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->key_size = key_size; cc->sector_size = (1 << SECTOR_SHIFT); + cc->sector_shift = 0; ti->private = cc; @@ -2870,7 +2873,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); - num_feature_args += (cc->sector_size != (1 << SECTOR_SHIFT)) ? 1 : 0; + num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); if (cc->on_disk_tag_size) num_feature_args++; From 78e470c26f524f4706c2555613b9641d85190cbe Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 22 Mar 2017 17:44:37 +0100 Subject: [PATCH 13/50] md: add raid4/5/6 journal mode switching API Commit 2ded370373a4 ("md/r5cache: State machine for raid5-cache write back mode") added support for "write-back" caching on the raid journal device. In order to allow the dm-raid target to switch between the available "write-through" and "write-back" modes, provide a new r5c_journal_mode_set() API. Use the new API in existing r5c_journal_mode_store() Signed-off-by: Heinz Mauelshagen Acked-by: Shaohua Li Signed-off-by: Mike Snitzer --- drivers/md/raid5-cache.c | 62 ++++++++++++++++++++++------------------ drivers/md/raid5.h | 11 +++++++ 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 3f307be01b10..218b6f37da85 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -53,16 +53,6 @@ */ #define R5L_POOL_SIZE 4 -/* - * r5c journal modes of the array: write-back or write-through. - * write-through mode has identical behavior as existing log only - * implementation. - */ -enum r5c_journal_mode { - R5C_JOURNAL_MODE_WRITE_THROUGH = 0, - R5C_JOURNAL_MODE_WRITE_BACK = 1, -}; - static char *r5c_journal_mode_str[] = {"write-through", "write-back"}; /* @@ -2327,40 +2317,56 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) return ret; } -static ssize_t r5c_journal_mode_store(struct mddev *mddev, - const char *page, size_t length) +/* + * Set journal cache mode on @mddev (external API initially needed by dm-raid). + * + * @mode as defined in 'enum r5c_journal_mode'. 
+ * + */ +int r5c_journal_mode_set(struct mddev *mddev, int mode) { struct r5conf *conf = mddev->private; struct r5l_log *log = conf->log; - int val = -1, i; - int len = length; if (!log) return -ENODEV; - if (len && page[len - 1] == '\n') - len -= 1; - for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) - if (strlen(r5c_journal_mode_str[i]) == len && - strncmp(page, r5c_journal_mode_str[i], len) == 0) { - val = i; - break; - } - if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || - val > R5C_JOURNAL_MODE_WRITE_BACK) + if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || + mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; if (raid5_calc_degraded(conf) > 0 && - val == R5C_JOURNAL_MODE_WRITE_BACK) + mode == R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; mddev_suspend(mddev); - conf->log->r5c_journal_mode = val; + conf->log->r5c_journal_mode = mode; mddev_resume(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", - mdname(mddev), val, r5c_journal_mode_str[val]); - return length; + mdname(mddev), mode, r5c_journal_mode_str[mode]); + return 0; +} +EXPORT_SYMBOL(r5c_journal_mode_set); + +static ssize_t r5c_journal_mode_store(struct mddev *mddev, + const char *page, size_t length) +{ + int mode = ARRAY_SIZE(r5c_journal_mode_str); + size_t len = length; + + if (len < 2) + return -EINVAL; + + if (page[len - 1] == '\n') + len--; + + while (mode--) + if (strlen(r5c_journal_mode_str[mode]) == len && + !strncmp(page, r5c_journal_mode_str[mode], len)) + break; + + return r5c_journal_mode_set(mddev, mode) ?: length; } struct md_sysfs_entry diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4bb27b97bf6b..ec8ca15774d7 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -547,6 +547,16 @@ struct r5worker_group { int stripes_cnt; }; +/* + * r5c journal modes of the array: write-back or write-through. + * write-through mode has identical behavior as existing log only + * implementation. + */ +enum r5c_journal_mode { + R5C_JOURNAL_MODE_WRITE_THROUGH = 0, + R5C_JOURNAL_MODE_WRITE_BACK = 1, +}; + enum r5_cache_state { R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, * waiting for 25% to be free @@ -795,4 +805,5 @@ extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode); #endif From 4464e36e06470e3d68dc26a874f0dbdffa09a6e8 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 18 Mar 2017 01:39:12 +0100 Subject: [PATCH 14/50] dm raid: fix table line argument order in status Commit 3a1c1ef2f ("dm raid: enhance status interface and fixup takeover/raid0") added new table line arguments and introduced an ordering flaw. The sequence of the raid10_copies and raid10_format raid parameters got reversed which causes lvm2 userspace to fail by falsely assuming a changed table line. Sequence those 2 parameters as before so that old lvm2 can function properly with new kernels by adjusting the table line output as documented in Documentation/device-mapper/dm-raid.txt. Also, add missing version 1.10.1 highlight to the documention. 
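
[Editor's illustration, not part of the patch] The user-visible impact of the argument ordering can be made concrete with a minimal standalone C sketch; the two table strings below are made up, not exact dm-raid output.

/*
 * Illustration only: userspace such as lvm2 compares the table line it
 * loaded against the one the kernel reports as a plain string, so a
 * reordered but semantically identical line is treated as a change.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *loaded   = "raid10 5 64 raid10_copies 2 raid10_format near";
	const char *reported = "raid10 5 64 raid10_format near raid10_copies 2";

	if (strcmp(loaded, reported))
		puts("mismatch: userspace falsely assumes a changed table line");
	else
		puts("table line unchanged");
	return 0;
}
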
Fixes: 3a1c1ef2f ("dm raid: enhance status interface and fixup takeover/raid0") Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 3 ++ drivers/md/dm-raid.c | 62 +++++++++++++------------ 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index cd2cb2fc85ea..95c4c8dd6dd1 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -331,3 +331,6 @@ Version History 'D' on the status line. If '- -' is passed into the constructor, emit '- -' on the table line and '-' as the status line health character. 1.10.0 Add support for raid4/5/6 journal device +1.10.1 Fix data corruption on reshape request +1.11.0 Fix table line argument order + (wrong raid10_copies/raid10_format sequence) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index f8564d63982f..e07185fca638 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3382,38 +3382,28 @@ static void raid_status(struct dm_target *ti, status_type_t type, hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); + /* Emit table line */ + /* This has to be in the documented order for userspace! */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); - if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) - DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), - raid10_md_layout_to_format(mddev->layout)); - if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), - raid10_md_layout_to_copies(mddev->layout)); - if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); - if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) - DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), - (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); - if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) - DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), - (unsigned long long) rs->data_offset); - if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) - DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), - mddev->bitmap_info.daemon_sleep); - if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), - max(rs->delta_disks, mddev->delta_disks)); - if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), - max_nr_stripes); + if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) + DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); if (rebuild_disks) for (i = 0; i < rs->raid_disks; i++) if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), rs->dev[i].rdev.raid_disk); + if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) + DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), + mddev->bitmap_info.daemon_sleep); + if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), + mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, 
&rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), + mddev->sync_speed_max); if (write_mostly_params) for (i = 0; i < rs->raid_disks; i++) if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) @@ -3422,12 +3412,24 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND), mddev->bitmap_info.max_write_behind); - if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), - mddev->sync_speed_max); - if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) - DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), - mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE), + max_nr_stripes); + if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE), + (unsigned long long) to_sector(mddev->bitmap_info.chunksize)); + if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES), + raid10_md_layout_to_copies(mddev->layout)); + if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT), + raid10_md_layout_to_format(mddev->layout)); + if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) + DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS), + max(rs->delta_disks, mddev->delta_disks)); + if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) + DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET), + (unsigned long long) rs->data_offset); if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), __get_dev_name(rs->journal_dev.dev)); @@ -3791,7 +3793,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 10, 1}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, From 6e53636fe81465d6810f4e0910e7238edf12a133 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 22 Mar 2017 17:44:38 +0100 Subject: [PATCH 15/50] dm raid: add raid4/5/6 journal write-back support via journal_mode option Commit 63c32ed4afc ("dm raid: add raid4/5/6 journaling support") added journal support to close the raid4/5/6 "write hole" -- in terms of writethrough caching. Introduce a "journal_mode" feature and use the new r5c_journal_mode_set() API to add support for switching the journal device's cache mode between write-through (the current default) and write-back. NOTE: If the journal device is not layered on resilent storage and it fails, write-through mode will cause the "write hole" to reoccur. But if the journal fails while in write-back mode it will cause data loss for any dirty cache entries unless resilent storage is used for the journal. 
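
[Editor's illustration, not part of the patch] A minimal sketch of the journal-mode name lookup the new parameter relies on; this is standalone userspace C mirroring the writethrough/writeback correlation, not the dm-raid code itself.

/*
 * Illustration only: map the dm-raid "journal_mode" argument string to the
 * MD journal mode it selects (0 = write-through, 1 = write-back), i.e. the
 * value the target hands to r5c_journal_mode_set().
 */
#include <stdio.h>
#include <strings.h>

enum journal_mode { JOURNAL_WRITE_THROUGH = 0, JOURNAL_WRITE_BACK = 1 };

static const struct {
	enum journal_mode mode;
	const char *param;
} journal_modes[] = {
	{ JOURNAL_WRITE_THROUGH, "writethrough" },
	{ JOURNAL_WRITE_BACK,    "writeback"    },
};

static int journal_mode_from_param(const char *param)
{
	int m = sizeof(journal_modes) / sizeof(journal_modes[0]);

	while (m--)
		if (!strcasecmp(param, journal_modes[m].param))
			return journal_modes[m].mode;
	return -1;				/* invalid argument */
}

int main(void)
{
	printf("writeback -> %d\n", journal_mode_from_param("writeback"));	/* 1 */
	printf("bogus     -> %d\n", journal_mode_from_param("bogus"));		/* -1 */
	return 0;
}

On the target line both arguments appear together, e.g. "... journal_dev <dev> journal_mode writeback ...", since journal_mode is rejected when no journal_dev is given.
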
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 11 ++- drivers/md/dm-raid.c | 104 +++++++++++++++++++++--- 2 files changed, 101 insertions(+), 14 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 95c4c8dd6dd1..7e06e65586d4 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters: Takeover/reshape is not possible with a raid4/5/6 journal device; it has to be deconfigured before requesting these. + [journal_mode ] + This option sets the caching mode on journaled raid4/5/6 raid sets + (see 'journal_dev ' above) to 'writethrough' or 'writeback'. + If 'writeback' is selected the journal device has to be resilient + and must not suffer from the 'write hole' problem itself (e.g. use + raid1 or raid10) to avoid a single point of failure. + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the @@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields: The current data offset to the start of the user data on each component device of a raid set (see the respective raid parameter to support out-of-place reshaping). - 'A' - active raid4/5/6 journal device. + 'A' - active write-through journal device. + 'a' - active write-back journal device. 'D' - dead journal device. '-' - no journal device. @@ -334,3 +342,4 @@ Version History 1.10.1 Fix data corruption on reshape request 1.11.0 Fix table line argument order (wrong raid10_copies/raid10_format sequence) +1.11.1 Add raid4/5/6 journal write-back support via journal_mode option diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e07185fca638..0f61bb659b73 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2010-2011 Neil Brown - * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved. + * Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -79,7 +79,10 @@ struct raid_dev { #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ /* New for v1.10.0 */ -#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */ + +/* New for v1.11.1 */ +#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */ /* * Flags for rs->ctr_flags field. 
@@ -100,6 +103,7 @@ struct raid_dev { #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) +#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) #define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) @@ -175,7 +179,8 @@ struct raid_dev { CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DATA_OFFSET | \ - CTR_FLAG_JOURNAL_DEV) + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ CTR_FLAG_REBUILD | \ @@ -186,7 +191,8 @@ struct raid_dev { CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DATA_OFFSET | \ - CTR_FLAG_JOURNAL_DEV) + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) /* ...valid options definitions per raid level */ /* @@ -239,6 +245,7 @@ struct raid_set { struct journal_dev { struct dm_dev *dev; struct md_rdev rdev; + int mode; } journal_dev; struct raid_dev dev[0]; @@ -326,6 +333,7 @@ static struct arg_name_flag { { CTR_FLAG_DELTA_DISKS, "delta_disks"}, { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, + { CTR_FLAG_JOURNAL_MODE, "journal_mode" }, }; /* Return argument name string for given @flag */ @@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag) return NULL; } +/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */ +static struct { + const int mode; + const char *param; +} _raid456_journal_mode[] = { + { R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" }, + { R5C_JOURNAL_MODE_WRITE_BACK , "writeback" } +}; + +/* Return MD raid4/5/6 journal mode for dm @journal_mode one */ +static int dm_raid_journal_mode_to_md(const char *mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (!strcasecmp(mode, _raid456_journal_mode[m].param)) + return _raid456_journal_mode[m].mode; + + return -EINVAL; +} + +/* Return dm-raid raid4/5/6 journal mode string for @mode */ +static const char *md_journal_mode_to_dm_raid(const int mode) +{ + int m = ARRAY_SIZE(_raid456_journal_mode); + + while (m--) + if (mode == _raid456_journal_mode[m].mode) + return _raid456_journal_mode[m].param; + + return "unknown"; +} + /* * Bool helpers to test for various raid levels of a raid set. * It's level as reported by the superblock rather than @@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, continue; } - /* "journal_dev dev" */ + /* "journal_dev " */ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { int r; struct md_rdev *jdev; @@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, rs->ti->error = "No space for raid4/5/6 journal"; return -ENOSPC; } + rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH; set_bit(Journal, &jdev->flags); continue; } + /* "journal_mode " ("journal_dev" mandatory!) 
*/ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) { + int r; + + if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'"; + return -EINVAL; + } + if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed"; + return -EINVAL; + } + r = dm_raid_journal_mode_to_md(arg); + if (r < 0) { + rs->ti->error = "Invalid 'journal_mode' argument"; + return r; + } + rs->journal_dev.mode = r; + continue; + } + /* * Parameters with number values from here on. */ @@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + /* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */ + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { + r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); + if (r) { + ti->error = "Failed to set raid4/5/6 journal mode"; + mddev_unlock(&rs->md); + goto bad_journal_mode_set; + } + } + mddev_suspend(&rs->md); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ @@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_unlock(&rs->md); return 0; +bad_journal_mode_set: bad_stripe_cache: bad_check_reshape: md_stop(&rs->md); @@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev) * Status characters: * * 'D' = Dead/Failed raid set component or raid4/5/6 journal device - * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device + * 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device + * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) { if (!rdev->bdev) return "-"; else if (test_bit(Faulty, &rdev->flags)) return "D"; else if (test_bit(Journal, &rdev->flags)) - return "A"; + return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a"; else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) return "a"; else @@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); /* * In-sync/Reshape ratio: @@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); break; case STATUSTYPE_TABLE: @@ -3381,7 +3455,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, write_mostly_params + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + - (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 
2 : 0) + + (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0); /* Emit table line */ /* This has to be in the documented order for userspace! */ @@ -3433,6 +3508,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), __get_dev_name(rs->journal_dev.dev)); + if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE), + md_journal_mode_to_dm_raid(rs->journal_dev.mode)); DMEMIT(" %d", rs->raid_disks); for (i = 0; i < rs->raid_disks; i++) DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), @@ -3793,7 +3871,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 11, 0}, + .version = {1, 11, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, From 7b81ef8b14f80033e4a4168d199a0f5fd79b9426 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 28 Mar 2017 12:53:39 -0400 Subject: [PATCH 16/50] dm raid: select the Kconfig option CONFIG_MD_RAID0 Since the commit 0cf4503174c1 ("dm raid: add support for the MD RAID0 personality"), the dm-raid subsystem can activate a RAID-0 array. Therefore, add MD_RAID0 to the dependencies of DM_RAID, so that MD_RAID0 will be selected when DM_RAID is selected. Fixes: 0cf4503174c1 ("dm raid: add support for the MD RAID0 personality") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 5c5ed97c9fda..ee2c21e3d232 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -357,6 +357,7 @@ config DM_LOG_USERSPACE config DM_RAID tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM + select MD_RAID0 select MD_RAID1 select MD_RAID10 select MD_RAID456 From 449b668ce0b9069fcaafa6344c7f10fa2ba9632e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 31 Mar 2017 10:09:45 -0400 Subject: [PATCH 17/50] dm cache: set/clear the cache core's dirty_bitset when loading mappings When loading metadata make sure to set/clear the dirty bits in the cache core's dirty_bitset as well as the policy. Otherwise the cache core is unaware that any blocks were dirty when the cache was last shutdown. A very serious side-effect being that the cleaner policy would therefore never be tasked with writing back dirty data from a cache that was in writeback mode (e.g. when switching from smq policy to cleaner policy when decommissioning a writeback cache). This fixes a serious data corruption bug associated with writeback mode. 
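
[Editor's illustration, not part of the patch] The invariant this fix restores can be shown with a small sketch; this is standalone userspace C with made-up types, whereas the real code updates cache->dirty_bitset and cache->nr_dirty before handing the mapping to the policy.

/*
 * Illustration only: while mappings are loaded, the cache core must record
 * each block's dirty state itself (bitmap plus counter) in addition to
 * informing the policy, so later writeback knows which blocks are dirty.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long dirty_bitset;	/* one bit per cache block (<= 64 here) */
static unsigned int nr_dirty;

static void core_load_mapping(unsigned int cblock, bool dirty)
{
	if (dirty) {
		dirty_bitset |= 1UL << cblock;
		nr_dirty++;
	} else {
		dirty_bitset &= ~(1UL << cblock);
	}
	/* ...then pass the mapping on to the policy, as before the fix. */
}

int main(void)
{
	core_load_mapping(2, true);
	core_load_mapping(5, false);
	printf("nr_dirty=%u dirty_bitset=0x%lx\n", nr_dirty, dirty_bitset);
	return 0;
}
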
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index b7de289a10bb..6e747fcbdf0f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2960,6 +2960,12 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, int r; struct cache *cache = context; + if (dirty) { + set_bit(from_cblock(cblock), cache->dirty_bitset); + atomic_inc(&cache->nr_dirty); + } else + clear_bit(from_cblock(cblock), cache->dirty_bitset); + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); if (r) return r; From cc7e394024770d4bfd8463fab1a9e2e262a7d7c1 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 31 Mar 2017 10:46:54 -0400 Subject: [PATCH 18/50] dm cache policy smq: make the cleaner policy write-back more aggressively By ignoring the sentinels the cleaner policy is able to write-back dirty cache data much faster. There is no reason to respect the sentinels, which denote that a block was changed recently, when using the cleaner policy given that the cleaner is tasked with writing back all dirty data. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 74436dc2122f..e0c40aec5e96 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1166,7 +1166,7 @@ static void queue_writeback(struct smq_policy *mq) struct policy_work work; struct entry *e; - e = q_peek(&mq->dirty, mq->dirty.nr_levels, false); + e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed); if (e) { mark_pending(mq, e); q_del(&mq->dirty, e); From 3c12016910061c2a19d985fba7f7dec19d6a3a09 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 16:51:46 -0400 Subject: [PATCH 19/50] dm table: replace while loops with for loops Also remove some unnecessary use of uninitialized_var(). Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 63 ++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b0600840e734..7fb29db478cd 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -373,7 +373,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, */ dev_t dm_get_dev_t(const char *path) { - dev_t uninitialized_var(dev); + dev_t dev; struct block_device *bdev; bdev = lookup_bdev(path); @@ -627,13 +627,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, struct dm_target *uninitialized_var(ti); struct queue_limits ti_limits; - unsigned i = 0; + unsigned i; /* * Check each entry in the table in turn. */ - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); + for (i = 0; i < dm_table_get_num_targets(table); i++) { + ti = dm_table_get_target(table, i); blk_set_stacking_limits(&ti_limits); @@ -854,11 +854,11 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_dax(struct dm_table *t) { struct dm_target *ti; - unsigned i = 0; + unsigned i; /* Ensure that all targets support DAX. 
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->type->direct_access) return false; @@ -1010,11 +1010,11 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t) struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) { - struct dm_target *uninitialized_var(ti); - unsigned i = 0; + struct dm_target *ti; + unsigned i; - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (dm_target_is_wildcard(ti->type)) return ti; } @@ -1321,15 +1321,16 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, */ bool dm_table_has_no_data_devices(struct dm_table *table) { - struct dm_target *uninitialized_var(ti); - unsigned i = 0, num_devices = 0; + struct dm_target *ti; + unsigned i, num_devices; - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); + for (i = 0; i < dm_table_get_num_targets(table); i++) { + ti = dm_table_get_target(table, i); if (!ti->type->iterate_devices) return false; + num_devices = 0; ti->type->iterate_devices(ti, count_device, &num_devices); if (num_devices) return false; @@ -1344,16 +1345,16 @@ bool dm_table_has_no_data_devices(struct dm_table *table) int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits) { - struct dm_target *uninitialized_var(ti); + struct dm_target *ti; struct queue_limits ti_limits; - unsigned i = 0; + unsigned i; blk_set_stacking_limits(limits); - while (i < dm_table_get_num_targets(table)) { + for (i = 0; i < dm_table_get_num_targets(table); i++) { blk_set_stacking_limits(&ti_limits); - ti = dm_table_get_target(table, i++); + ti = dm_table_get_target(table, i); if (!ti->type->iterate_devices) goto combine_limits; @@ -1435,7 +1436,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { struct dm_target *ti; - unsigned i = 0; + unsigned i; /* * Require at least one underlying device to support flushes. @@ -1443,8 +1444,8 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) * so we need to use iterate_devices here, which targets * supporting flushes must provide. 
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->num_flush_bios) continue; @@ -1504,10 +1505,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { struct dm_target *ti; - unsigned i = 0; + unsigned i; - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->type->iterate_devices || !ti->type->iterate_devices(ti, func, NULL)) @@ -1528,10 +1529,10 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de static bool dm_table_supports_write_same(struct dm_table *t) { struct dm_target *ti; - unsigned i = 0; + unsigned i; - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->num_write_same_bios) return false; @@ -1555,7 +1556,7 @@ static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_discards(struct dm_table *t) { struct dm_target *ti; - unsigned i = 0; + unsigned i; /* * Unless any target used by the table set discards_supported, @@ -1564,8 +1565,8 @@ static bool dm_table_supports_discards(struct dm_table *t) * so we need to use iterate_devices here, which targets * supporting discard selectively must provide. */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); if (!ti->num_discard_bios) continue; From e2460f2a4bc740fae9e23f14d653cf53e90b3f9a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 16:51:48 -0400 Subject: [PATCH 20/50] dm: mark targets that pass integrity data A dm-crypt on dm-integrity device incorrectly advertises an integrity profile on the DM crypt device. It can be seen in the files "/sys/block/dm-*/integrity/*" that both dm-integrity and dm-crypt target advertise the integrity profile. That is incorrect, only the dm-integrity target should advertise the integrity profile. A general problem in DM is that if we have a DM device that depends on another device with an integrity profile, the upper device will always advertise the integrity profile, even when the target driver doesn't support handling integrity data. Most targets don't support integrity data, so we provide a whitelist of targets that support it (linear, delay and striped). The targets that support passing integrity data to the lower device are marked with the flag DM_TARGET_PASSES_INTEGRITY. The DM core will now advertise integrity data on a DM device only if all the targets support the integrity data. 
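As a hedged sketch of how a target opts in (the 'example' target and its example_ctr/example_dtr/example_map callbacks are hypothetical; only the DM_TARGET_PASSES_INTEGRITY flag is introduced by this patch):

static struct target_type example_target = {
	.name     = "example",
	.version  = {1, 0, 0},
	.features = DM_TARGET_PASSES_INTEGRITY,	/* forward bio integrity data to the lower device */
	.module   = THIS_MODULE,
	.ctr      = example_ctr,
	.dtr      = example_dtr,
	.map      = example_map,
};

With this flag checked per target, dm_table_get_integrity_disk() refuses to stack an integrity profile unless every target in the table passes integrity data.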
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 1 + drivers/md/dm-linear.c | 1 + drivers/md/dm-stripe.c | 1 + drivers/md/dm-table.c | 7 +++++++ drivers/md/dm.c | 16 +++++++++++++--- include/linux/device-mapper.h | 6 ++++++ 6 files changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index cc70871a6d29..ae3158795d26 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -340,6 +340,7 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 2, 1}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4788b0b989a9..ffa0c9c5968a 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -162,6 +162,7 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector, static struct target_type linear_target = { .name = "linear", .version = {1, 3, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 28193a57bf47..d7e1b86e7570 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -440,6 +440,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", .version = {1, 6, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7fb29db478cd..c68757f94e16 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1135,6 +1135,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; struct gendisk *prev_disk = NULL, *template_disk = NULL; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + struct dm_target *ti = dm_table_get_target(t, i); + if (!dm_target_passes_integrity(ti->type)) + goto no_integrity; + } list_for_each_entry(dd, devices, list) { template_disk = dd->dm_dev->bdev->bd_disk; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f4ffd1eb8f44..e602ae0d5d75 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1089,8 +1089,18 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) { - int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (unlikely(bio_integrity(bio) != NULL)) { + int r; + + if (unlikely(!dm_target_has_integrity(tio->ti->type) && + !dm_target_passes_integrity(tio->ti->type))) { + DMWARN("%s: the target %s doesn't support integrity data.", + dm_device_name(tio->io->md), + tio->ti->type->name); + return -EIO; + } + + r = bio_integrity_clone(clone, bio, GFP_NOIO); if (r < 0) return r; } @@ -1098,7 +1108,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); - if (bio_integrity(bio)) + if (unlikely(bio_integrity(bio) != NULL)) bio_integrity_trim(clone, 0, len); return 0; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 874462153f14..98f981026e4e 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -227,6 +227,12 @@ typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio) #define DM_TARGET_INTEGRITY 0x00000010 #define 
dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) +/* + * A target passes integrity data to the lower device. + */ +#define DM_TARGET_PASSES_INTEGRITY 0x00000020 +#define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY) + struct dm_target { struct dm_table *table; struct target_type *type; From 56b67a4f292f14548f4046979d46d07bcf8ba971 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 16:51:50 -0400 Subject: [PATCH 21/50] dm integrity: various small changes and cleanups Some coding style changes. Fix a bug that the array test_tag has insufficient size if the digest size of internal has is bigger than the tag size. The function __fls is undefined for zero argument, this patch fixes undefined behavior if the user sets zero interleave_sectors. Fix the limit of optional arguments to 8. Don't allocate crypt_data on the stack to avoid a BUG with debug kernel. Rename all optional argument names to have underscores rather than dashes. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 20 ++-- drivers/md/dm-integrity.c | 116 ++++++++++--------- 2 files changed, 72 insertions(+), 64 deletions(-) diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt index 9d9089f74206..ced34cd915ef 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.txt @@ -69,17 +69,17 @@ Target arguments: Additional arguments: -journal-sectors:number +journal_sectors:number The size of journal, this argument is used only if formatting the device. If the device is already formatted, the value from the superblock is used. -interleave-sectors:number +interleave_sectors:number The number of interleaved sectors. This values is rounded down to a power of two. If the device is already formatted, the value from the superblock is used. -buffer-sectors:number +buffer_sectors:number The number of sectors in one buffer. The value is rounded down to a power of two. @@ -87,17 +87,17 @@ buffer-sectors:number configurable. The large buffer size means that the I/O size will be larger, but there could be less I/Os issued. -journal-watermark:number +journal_watermark:number The journal watermark in percents. When the size of the journal exceeds this watermark, the thread that flushes the journal will be started. -commit-time:number +commit_time:number Commit time in milliseconds. When this time passes, the journal is written. The journal is also written immediatelly if the FLUSH request is received. -internal-hash:algorithm(:key) (the key is optional) +internal_hash:algorithm(:key) (the key is optional) Use internal hash or crc. When this argument is used, the dm-integrity target won't accept integrity tags from the upper target, but it will automatically @@ -113,7 +113,7 @@ internal-hash:algorithm(:key) (the key is optional) from an upper layer target, such as dm-crypt. The upper layer target should check the validity of the integrity tags. -journal-crypt:algorithm(:key) (the key is optional) +journal_crypt:algorithm(:key) (the key is optional) Encrypt the journal using given algorithm to make sure that the attacker can't read the journal. You can use a block cipher here (such as "cbc(aes)") or a stream cipher (for example "chacha20", @@ -125,7 +125,7 @@ journal-crypt:algorithm(:key) (the key is optional) the size of files that were written. To protect against this situation, you can encrypt the journal. 
-journal-mac:algorithm(:key) (the key is optional) +journal_mac:algorithm(:key) (the key is optional) Protect sector numbers in the journal from accidental or malicious modification. To protect against accidental modification, use a crc algorithm, to protect against malicious modification, use a @@ -137,7 +137,7 @@ journal-mac:algorithm(:key) (the key is optional) this stage. -The journal mode (D/J), buffer-sectors, journal-watermark, commit-time can +The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can be changed when reloading the target (load an inactive table and swap the tables with suspend and resume). The other arguments should not be changed when reloading the target because the layout of disk data depend on them @@ -158,7 +158,7 @@ The layout of the formatted block device: provides (i.e. the size of the device minus the size of all metadata and padding). The user of this target should not send bios that access data beyond the "provided data sectors" limit. - * flags - a flag is set if journal-mac is used + * flags - a flag is set if journal_mac is used * journal The journal is divided into sections, each section contains: * metadata area (4kiB), it contains journal entries diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index e26a079b41ea..95cdffbb206c 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -27,8 +27,8 @@ #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 #define DEFAULT_MAX_JOURNAL_SECTORS 131072 -#define MIN_INTERLEAVE_SECTORS 3 -#define MAX_INTERLEAVE_SECTORS 31 +#define MIN_LOG2_INTERLEAVE_SECTORS 3 +#define MAX_LOG2_INTERLEAVE_SECTORS 31 #define METADATA_WORKQUEUE_MAX_ACTIVE 16 /* @@ -414,7 +414,7 @@ static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsi { unsigned sector; - access_journal_check(ic, section, offset, false, "access_journal"); + access_journal_check(ic, section, offset, false, "page_list_location"); sector = section * ic->journal_section_sectors + offset; @@ -1211,7 +1211,7 @@ static void integrity_metadata(struct work_struct *w) unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; - unsigned extra_space = digest_size > ic->tag_size ? digest_size - ic->tag_size : 0; + unsigned extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; char checksums_onstack[ic->tag_size + extra_space]; unsigned sectors_to_process = dio->range.n_sectors; sector_t sector = dio->range.logical_sector; @@ -1820,7 +1820,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, unlikely(from_replay) && #endif ic->internal_hash) { - unsigned char test_tag[ic->tag_size]; + char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; integrity_sector_checksum(ic, sec + (l - j), (char *)access_journal_data(ic, i, l), test_tag); @@ -2135,11 +2135,11 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->journal_mac_alg.alg_string; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); - DMEMIT(" journal-sectors:%u", ic->initial_sectors - SB_SECTORS); - DMEMIT(" interleave-sectors:%u", 1U << ic->sb->log2_interleave_sectors); - DMEMIT(" buffer-sectors:%u", 1U << ic->log2_buffer_sectors); - DMEMIT(" journal-watermark:%u", (unsigned)watermark_percentage); - DMEMIT(" commit-time:%u", ic->autocommit_msec); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); + DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); #define EMIT_ALG(a, n) \ do { \ @@ -2149,9 +2149,9 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(":%s", ic->a.key_string);\ } \ } while (0) - EMIT_ALG(internal_hash_alg, "internal-hash"); - EMIT_ALG(journal_crypt_alg, "journal-crypt"); - EMIT_ALG(journal_mac_alg, "journal-mac"); + EMIT_ALG(internal_hash_alg, "internal_hash"); + EMIT_ALG(journal_crypt_alg, "journal_crypt"); + EMIT_ALG(journal_mac_alg, "journal_mac"); break; } } @@ -2213,6 +2213,7 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec unsigned journal_sections; int test_bit; + memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT); memcpy(ic->sb->magic, SB_MAGIC, 8); ic->sb->version = SB_VERSION; ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); @@ -2225,9 +2226,11 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = 1; ic->sb->journal_sections = cpu_to_le32(journal_sections); + if (!interleave_sectors) + interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; ic->sb->log2_interleave_sectors = __fls(interleave_sectors); - ic->sb->log2_interleave_sectors = max((__u8)MIN_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - ic->sb->log2_interleave_sectors = min((__u8)MAX_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); + ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); ic->provided_data_sectors = 0; for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) { @@ -2238,7 +2241,7 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec ic->provided_data_sectors = prev_data_sectors; } - if (!le64_to_cpu(ic->provided_data_sectors)) + if (!ic->provided_data_sectors) return -EINVAL; ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); @@ -2444,6 +2447,12 @@ static int create_journal(struct dm_integrity_c *ic, char **error) int r = 0; unsigned i; __u64 journal_pages, 
journal_desc_size, journal_tree_size; + unsigned char *crypt_data = NULL; + + ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); + ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); + ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); + ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors, PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT); @@ -2541,7 +2550,13 @@ static int create_journal(struct dm_integrity_c *ic, char **error) SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); unsigned char iv[ivsize]; unsigned crypt_len = roundup(ivsize, blocksize); - unsigned char crypt_data[crypt_len]; + + crypt_data = kmalloc(crypt_len, GFP_KERNEL); + if (!crypt_data) { + *error = "Unable to allocate crypt data"; + r = -ENOMEM; + goto bad; + } skcipher_request_set_tfm(req, ic->journal_crypt); @@ -2630,38 +2645,38 @@ retest_commit_id: r = -ENOMEM; } bad: + kfree(crypt_data); return r; } /* - * Construct a integrity mapping: + * Construct a integrity mapping * * Arguments: * device * offset from the start of the device * tag size - * D - direct writes, J - journal writes + * D - direct writes, J - journal writes, R - recovery mode * number of optional arguments * optional arguments: - * journal-sectors - * interleave-sectors - * buffer-sectors - * journal-watermark - * commit-time - * internal-hash - * journal-crypt - * journal-mac + * journal_sectors + * interleave_sectors + * buffer_sectors + * journal_watermark + * commit_time + * internal_hash + * journal_crypt + * journal_mac */ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_integrity_c *ic; char dummy; int r; - unsigned i; unsigned extra_args; struct dm_arg_set as; static struct dm_arg _args[] = { - {0, 7, "Invalid number of feature args"}, + {0, 8, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; @@ -2683,11 +2698,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = ic; ti->per_io_data_size = sizeof(struct dm_integrity_io); - ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); - ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); - ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL); - ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL); - ic->in_progress = RB_ROOT; init_waitqueue_head(&ic->endio_wait); bio_list_init(&ic->flush_bio_list); @@ -2718,7 +2728,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) ic->mode = argv[3][0]; else { - ti->error = "Invalid mode (expecting J or D)"; + ti->error = "Invalid mode (expecting J, D, R)"; r = -EINVAL; goto bad; } @@ -2746,29 +2756,29 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Not enough feature arguments"; goto bad; } - if (sscanf(opt_string, "journal-sectors:%u%c", &val, &dummy) == 1) + if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1) journal_sectors = val; - else if (sscanf(opt_string, "interleave-sectors:%u%c", &val, &dummy) == 1) + else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1) interleave_sectors = val; - else if (sscanf(opt_string, "buffer-sectors:%u%c", &val, &dummy) == 1) + else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1) buffer_sectors = val; - else if 
(sscanf(opt_string, "journal-watermark:%u%c", &val, &dummy) == 1 && val <= 100) + else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100) journal_watermark = val; - else if (sscanf(opt_string, "commit-time:%u%c", &val, &dummy) == 1) + else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) sync_msec = val; - else if (!memcmp(opt_string, "internal-hash:", strlen("internal-hash:"))) { + else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, - "Invalid internal-hash argument"); + "Invalid internal_hash argument"); if (r) goto bad; - } else if (!memcmp(opt_string, "journal-crypt:", strlen("journal-crypt:"))) { + } else if (!memcmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) { r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error, - "Invalid journal-crypt argument"); + "Invalid journal_crypt argument"); if (r) goto bad; - } else if (!memcmp(opt_string, "journal-mac:", strlen("journal-mac:"))) { + } else if (!memcmp(opt_string, "journal_mac:", strlen("journal_mac:"))) { r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, - "Invalid journal-mac argument"); + "Invalid journal_mac argument"); if (r) goto bad; } else { @@ -2877,12 +2887,10 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = false; if (memcmp(ic->sb->magic, SB_MAGIC, 8)) { if (ic->mode != 'R') { - for (i = 0; i < 512; i += 8) { - if (*(__u64 *)((__u8 *)ic->sb + i)) { - r = -EINVAL; - ti->error = "The device is not initialized"; - goto bad; - } + if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) { + r = -EINVAL; + ti->error = "The device is not initialized"; + goto bad; } } @@ -2906,8 +2914,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } /* make sure that ti->max_io_len doesn't overflow */ - if (ic->sb->log2_interleave_sectors < MIN_INTERLEAVE_SECTORS || - ic->sb->log2_interleave_sectors > MAX_INTERLEAVE_SECTORS) { + if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || + ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { r = -EINVAL; ti->error = "Invalid interleave_sectors in the superblock"; goto bad; From 9d609f85b7eb96697ae22ad1d47a3a3920174ba8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 16:51:52 -0400 Subject: [PATCH 22/50] dm integrity: support larger block sizes The DM integrity block size can now be 512, 1k, 2k or 4k. Using larger blocks reduces metadata handling overhead. The block size can be configured at table load time using the "block_size:" option; where is expressed in bytes (defult is still 512 bytes). It is safe to use larger block sizes with DM integrity, because the DM integrity journal makes sure that the whole block is updated atomically even if the underlying device doesn't support atomic writes of that size (e.g. 4k block ontop of a 512b device). 
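As a hedged example of a constructor table line using the new option (the device, target length and tag size are hypothetical; the argument order and option names follow the dm-integrity documentation in this series):

  0 1953792 integrity /dev/sdb 0 4 J 2 internal_hash:crc32c block_size:4096

With block_size:4096 every bio must be aligned on 8-sector boundaries and the target length must stay within the provided data sectors recorded in the superblock.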
Depends-on: 2859323e ("block: fix blk_integrity_register to use template's interval_exp if not 0") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 5 + drivers/md/dm-integrity.c | 219 +++++++++++++++---- 2 files changed, 179 insertions(+), 45 deletions(-) diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt index ced34cd915ef..f33e3ade7a09 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.txt @@ -136,6 +136,11 @@ journal_mac:algorithm(:key) (the key is optional) the journal. Thus, modified sector number would be detected at this stage. +block_size:number + The size of a data block in bytes. The larger the block size the + less overhead there is for per-block integrity metadata. + Supported values are 512, 1024, 2048 and 4096 bytes. If not + specified the default block size is 512 bytes. The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can be changed when reloading the target (load an inactive table and swap the diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 95cdffbb206c..0354af4cd713 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #define SB_MAGIC "integrt" #define SB_VERSION 1 #define SB_SECTORS 8 +#define MAX_SECTORS_PER_BLOCK 8 struct superblock { __u8 magic[8]; @@ -54,6 +56,7 @@ struct superblock { __u32 journal_sections; __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; + __u8 log2_sectors_per_block; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 @@ -71,10 +74,12 @@ struct journal_entry { } s; __u64 sector; } u; - commit_id_t last_bytes; - __u8 tag[0]; + commit_id_t last_bytes[0]; + /* __u8 tag[0]; */ }; +#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) + #if BITS_PER_LONG == 64 #define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) #define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) @@ -100,7 +105,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, tag)) +#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) #define METADATA_PADDING_SECTORS 8 @@ -162,7 +167,7 @@ struct dm_integrity_c { unsigned short journal_entry_size; unsigned char journal_entries_per_sector; unsigned char journal_section_entries; - unsigned char journal_section_sectors; + unsigned short journal_section_sectors; unsigned journal_sections; unsigned journal_entries; sector_t device_sectors; @@ -170,6 +175,7 @@ struct dm_integrity_c { unsigned metadata_run; __s8 log2_metadata_run; __u8 log2_buffer_sectors; + __u8 sectors_per_block; unsigned char mode; bool suspending; @@ -332,6 +338,12 @@ static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); } +#define sector_to_block(ic, n) \ +do { \ + BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1)); \ + (n) >>= (ic)->sb->log2_sectors_per_block; \ +} while (0) + static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, sector_t offset, unsigned *metadata_offset) { @@ -345,6 +357,8 @@ static __u64 
get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t ms += area * ic->metadata_run; ms >>= ic->log2_buffer_sectors; + sector_to_block(ic, offset); + if (likely(ic->log2_tag_size >= 0)) { ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); @@ -459,9 +473,13 @@ static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, uns static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n) { - access_journal_check(ic, section, n, true, "access_journal_data"); + n <<= ic->sb->log2_sectors_per_block; - return access_journal(ic, section, n + JOURNAL_BLOCK_SECTORS); + n += JOURNAL_BLOCK_SECTORS; + + access_journal_check(ic, section, n, false, "access_journal_data"); + + return access_journal(ic, section, n); } static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE]) @@ -812,6 +830,8 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig int r; unsigned sector, pl_index, pl_offset; + BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1)); + if (unlikely(dm_integrity_failed(ic))) { fn(-1UL, data); return; @@ -846,6 +866,8 @@ static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range * struct rb_node **n = &ic->in_progress.rb_node; struct rb_node *parent; + BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); + parent = NULL; while (*n) { @@ -1175,7 +1197,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - r = crypto_shash_update(req, data, 1 << SECTOR_SHIFT); + r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); goto failed; @@ -1219,7 +1241,7 @@ static void integrity_metadata(struct work_struct *w) if (unlikely(ic->mode == 'R')) goto skip_io; - checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT) * ic->tag_size + extra_space, + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); if (!checksums) checksums = checksums_onstack; @@ -1235,9 +1257,9 @@ again: do { integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); checksums_ptr += ic->tag_size; - sectors_to_process--; - pos += 1 << SECTOR_SHIFT; - sector++; + sectors_to_process -= ic->sectors_per_block; + pos += ic->sectors_per_block << SECTOR_SHIFT; + sector += ic->sectors_per_block; } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack); kunmap_atomic(mem); @@ -1272,7 +1294,9 @@ again: if (bip) { struct bio_vec biv; struct bvec_iter iter; - unsigned data_to_process = dio->range.n_sectors * ic->tag_size; + unsigned data_to_process = dio->range.n_sectors; + sector_to_block(ic, data_to_process); + data_to_process *= ic->tag_size; bip_for_each_vec(biv, bip, iter) { unsigned char *tag; @@ -1303,6 +1327,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) { struct dm_integrity_c *ic = ti->private; struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + struct bio_integrity_payload *bip; sector_t area, offset; @@ -1330,6 +1355,44 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) (unsigned long long)ic->provided_data_sectors); return -EIO; } + if 
(unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { + DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", + ic->sectors_per_block, + (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); + return -EIO; + } + + if (ic->sectors_per_block > 1) { + struct bvec_iter iter; + struct bio_vec bv; + bio_for_each_segment(bv, bio, iter) { + if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { + DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", + bv.bv_offset, bv.bv_len, ic->sectors_per_block); + return -EIO; + } + } + } + + bip = bio_integrity(bio); + if (!ic->internal_hash) { + if (bip) { + unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block; + if (ic->log2_tag_size >= 0) + wanted_tag_size <<= ic->log2_tag_size; + else + wanted_tag_size *= ic->tag_size; + if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { + DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); + return -EIO; + } + } + } else { + if (unlikely(bip != NULL)) { + DMERR("Unexpected integrity data when using internal hash"); + return -EIO; + } + } if (unlikely(ic->mode == 'R') && unlikely(dio->write)) return -EIO; @@ -1369,6 +1432,8 @@ retry_kmap: if (unlikely(!dio->write)) { struct journal_sector *js; + char *mem_ptr; + unsigned s; if (unlikely(journal_entry_is_inprogress(je))) { flush_dcache_page(bv.bv_page); @@ -1380,14 +1445,20 @@ retry_kmap: smp_rmb(); BUG_ON(journal_entry_get_sector(je) != logical_sector); js = access_journal_data(ic, journal_section, journal_entry); - memcpy(mem + bv.bv_offset, js, JOURNAL_SECTOR_DATA); - memcpy(mem + bv.bv_offset + JOURNAL_SECTOR_DATA, &je->last_bytes, sizeof je->last_bytes); + mem_ptr = mem + bv.bv_offset; + s = 0; + do { + memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA); + *(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s]; + js++; + mem_ptr += 1 << SECTOR_SHIFT; + } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(memcmp(checksums_onstack, je->tag, ic->tag_size))) { + if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR("Checksum failed when reading from journal, at sector 0x%llx", (unsigned long long)logical_sector); } @@ -1398,7 +1469,7 @@ retry_kmap: if (!ic->internal_hash) { struct bio_integrity_payload *bip = bio_integrity(bio); unsigned tag_todo = ic->tag_size; - char *tag_ptr = je->tag; + char *tag_ptr = journal_entry_tag(ic, je); if (bip) do { struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); @@ -1421,24 +1492,29 @@ retry_kmap: if (likely(dio->write)) { struct journal_sector *js; + unsigned s; js = access_journal_data(ic, journal_section, journal_entry); - memcpy(js, mem + bv.bv_offset, 1 << SECTOR_SHIFT); - je->last_bytes = js->commit_id; + memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT); + + s = 0; + do { + je->last_bytes[s] = js[s].commit_id; + } while (++s < ic->sectors_per_block); if (ic->internal_hash) { unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); if (unlikely(digest_size > ic->tag_size)) { char checksums_onstack[digest_size]; integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); - memcpy(je->tag, checksums_onstack, ic->tag_size); + 
memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); } else - integrity_sector_checksum(ic, logical_sector, (char *)js, je->tag); + integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); } journal_entry_set_sector(je, logical_sector); } - logical_sector++; + logical_sector += ic->sectors_per_block; journal_entry++; if (unlikely(journal_entry == ic->journal_section_entries)) { @@ -1447,8 +1523,8 @@ retry_kmap: wraparound_section(ic, &journal_section); } - bv.bv_offset += 1 << SECTOR_SHIFT; - } while (bv.bv_len -= 1 << SECTOR_SHIFT); + bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT; + } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT); if (unlikely(!dio->write)) flush_dcache_page(bv.bv_page); @@ -1526,7 +1602,8 @@ retry: pos = journal_section * ic->journal_section_entries + journal_entry; ws = journal_section; we = journal_entry; - for (i = 0; i < dio->range.n_sectors; i++) { + i = 0; + do { struct journal_entry *je; add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i); @@ -1543,7 +1620,7 @@ retry: ws++; wraparound_section(ic, &ws); } - } + } while ((i += ic->sectors_per_block) < dio->range.n_sectors); spin_unlock_irq(&ic->endio_wait.lock); goto journal_read_write; @@ -1555,8 +1632,9 @@ retry: dio->range.n_sectors = next_sector - dio->range.logical_sector; } else { unsigned i; - for (i = 1; i < dio->range.n_sectors; i++) { - if (!test_journal_node(ic, journal_read_pos + i, dio->range.logical_sector + i)) + unsigned jp = journal_read_pos + 1; + for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) { + if (!test_journal_node(ic, jp, dio->range.logical_sector + i)) break; } dio->range.n_sectors = i; @@ -1725,6 +1803,16 @@ static void complete_copy_from_journal(unsigned long error, void *context) complete_journal_op(comp); } +static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js, + struct journal_entry *je) +{ + unsigned s = 0; + do { + js->commit_id = je->last_bytes[s]; + js++; + } while (++s < ic->sectors_per_block); +} + static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, unsigned write_sections, bool from_replay) { @@ -1753,8 +1841,14 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, continue; BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay); sec = journal_entry_get_sector(je); + if (unlikely(from_replay)) { + if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) { + dm_integrity_io_error(ic, "invalid sector in journal", -EIO); + sec &= ~(sector_t)(ic->sectors_per_block - 1); + } + } get_area_and_offset(ic, sec, &area, &offset); - access_journal_data(ic, i, j)->commit_id = je->last_bytes; + restore_last_bytes(ic, access_journal_data(ic, i, j), je); for (k = j + 1; k < ic->journal_section_entries; k++) { struct journal_entry *je2 = access_journal_entry(ic, i, k); sector_t sec2, area2, offset2; @@ -1763,16 +1857,16 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); sec2 = journal_entry_get_sector(je2); get_area_and_offset(ic, sec2, &area2, &offset2); - if (area2 != area || offset2 != offset + (k - j)) + if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block)) break; - access_journal_data(ic, i, k)->commit_id = je2->last_bytes; + restore_last_bytes(ic, access_journal_data(ic, i, k), je2); } next_loop = k - 1; io = 
mempool_alloc(ic->journal_io_mempool, GFP_NOIO); io->comp = ∁ io->range.logical_sector = sec; - io->range.n_sectors = k - j; + io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); while (unlikely(!add_new_range(ic, &io->range))) @@ -1788,8 +1882,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, journal_entry_set_unused(je2); remove_journal_node(ic, §ion_node[j]); j++; - sec++; - offset++; + sec += ic->sectors_per_block; + offset += ic->sectors_per_block; } while (j < k && find_newer_committed_node(ic, §ion_node[k - 1])) { struct journal_entry *je2 = access_journal_entry(ic, i, k - 1); @@ -1822,14 +1916,14 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, ic->internal_hash) { char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; - integrity_sector_checksum(ic, sec + (l - j), + integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); - if (unlikely(memcmp(test_tag, je2->tag, ic->tag_size))) + if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); } journal_entry_set_unused(je2); - r = dm_integrity_rw_tag(ic, je2->tag, &metadata_block, &metadata_offset, + r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset, ic->tag_size, TAG_WRITE); if (unlikely(r)) { dm_integrity_io_error(ic, "reading tags", r); @@ -1837,7 +1931,9 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, } atomic_inc(&comp.in_flight); - copy_from_journal(ic, i, j, k - j, get_data_sector(ic, area, offset), + copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block, + (k - j) << ic->sb->log2_sectors_per_block, + get_data_sector(ic, area, offset), complete_copy_from_journal, io); skip_io: j = next_loop; @@ -2130,6 +2226,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); arg_count = 5; + arg_count += ic->sectors_per_block != 1; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2140,6 +2237,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); + if (ic->sectors_per_block != 1) + DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); #define EMIT_ALG(a, n) \ do { \ @@ -2165,19 +2264,30 @@ static int dm_integrity_iterate_devices(struct dm_target *ti, return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data); } +static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_integrity_c *ic = ti->private; + + if (ic->sectors_per_block > 1) { + limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT; + limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT; + blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT); + } +} + static void calculate_journal_section_size(struct dm_integrity_c *ic) { unsigned sector_space = JOURNAL_SECTOR_DATA; ic->journal_sections = le32_to_cpu(ic->sb->journal_sections); - ic->journal_entry_size = 
roundup(offsetof(struct journal_entry, tag) + ic->tag_size, + ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size, JOURNAL_ENTRY_ROUNDUP); if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) sector_space -= JOURNAL_MAC_PER_SECTOR; ic->journal_entries_per_sector = sector_space / ic->journal_entry_size; ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS; - ic->journal_section_sectors = ic->journal_section_entries + JOURNAL_BLOCK_SECTORS; + ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS; ic->journal_entries = ic->journal_section_entries * ic->journal_sections; } @@ -2192,7 +2302,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic) return -EINVAL; ic->initial_sectors = initial_sectors; - ic->metadata_run = roundup((__u64)ic->tag_size << ic->sb->log2_interleave_sectors, + ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; if (!(ic->metadata_run & (ic->metadata_run - 1))) ic->log2_metadata_run = __ffs(ic->metadata_run); @@ -2217,6 +2327,7 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec memcpy(ic->sb->magic, SB_MAGIC, 8); ic->sb->version = SB_VERSION; ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size); + ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block); if (ic->journal_mac_alg.alg_string) ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC); @@ -2256,8 +2367,9 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) memset(&bi, 0, sizeof(bi)); bi.profile = &dm_integrity_profile; - bi.tuple_size = ic->tag_size * (queue_logical_block_size(disk->queue) >> SECTOR_SHIFT); - bi.tag_size = ic->tag_size; + bi.tuple_size = ic->tag_size; + bi.tag_size = bi.tuple_size; + bi.interval_exp = ilog2(ic->sectors_per_block << SECTOR_SHIFT); blk_integrity_register(disk, &bi); blk_queue_max_integrity_segments(disk->queue, UINT_MAX); @@ -2667,6 +2779,7 @@ bad: * internal_hash * journal_crypt * journal_mac + * block_size */ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@ -2676,7 +2789,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned extra_args; struct dm_arg_set as; static struct dm_arg _args[] = { - {0, 8, "Invalid number of feature args"}, + {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; @@ -2740,6 +2853,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; + ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; as.argv = argv + DIRECT_ARGUMENTS; @@ -2766,7 +2880,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) journal_watermark = val; else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1) sync_msec = val; - else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { + else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) { + if (val < 1 << SECTOR_SHIFT || + val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT || + (val & (val -1))) { + r = -EINVAL; + ti->error = "Invalid block_size argument"; + goto bad; + } + 
ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (!memcmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, "Invalid internal_hash argument"); if (r) @@ -2910,7 +3033,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) { r = -EINVAL; - ti->error = "Invalid tag size"; + ti->error = "Tag size doesn't match the information in superblock"; + goto bad; + } + if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) { + r = -EINVAL; + ti->error = "Block size doesn't match the information in superblock"; goto bad; } /* make sure that ti->max_io_len doesn't overflow */ @@ -3084,6 +3212,7 @@ static struct target_type integrity_target = { .resume = dm_integrity_resume, .status = dm_integrity_status, .iterate_devices = dm_integrity_iterate_devices, + .io_hints = dm_integrity_io_hints, }; int __init dm_integrity_init(void) From 583fe7474c05ee5523e14ef791ea59604cd846dc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 16:51:54 -0400 Subject: [PATCH 23/50] dm crypt: fix large block integrity support Previously, dm-crypt could use blocks composed of multiple 512b sectors but it created integrity profile for each 512b sector (it padded it with zeroes). Fix dm-crypt so that the integrity profile is sent for each block not each sector. The user must use the same block size in the DM crypt and integrity targets. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ccbc7f36bb2e..8bff6f7a4c6c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -938,10 +938,15 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } - if (bi->tag_size != cc->on_disk_tag_size) { + if (bi->tag_size != cc->on_disk_tag_size || + bi->tuple_size != cc->on_disk_tag_size) { ti->error = "Integrity profile tag size mismatch."; return -EINVAL; } + if (1 << bi->interval_exp != cc->sector_size) { + ti->error = "Integrity profile sector size mismatch."; + return -EINVAL; + } if (crypt_integrity_aead(cc)) { cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; @@ -1322,7 +1327,7 @@ static int crypt_convert(struct crypt_config *cc, case -EINPROGRESS: ctx->r.req = NULL; ctx->cc_sector += sector_step; - tag_offset += sector_step; + tag_offset++; continue; /* * The request was already processed (synchronously). 
@@ -1330,7 +1335,7 @@ static int crypt_convert(struct crypt_config *cc, case 0: atomic_dec(&ctx->cc_pending); ctx->cc_sector += sector_step; - tag_offset += sector_step; + tag_offset++; cond_resched(); continue; /* @@ -2735,6 +2740,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate integrity tags mempool"; goto bad; } + + cc->tag_pool_max_sectors <<= cc->sector_shift; } ret = -ENOMEM; @@ -2816,16 +2823,15 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); if (cc->on_disk_tag_size) { - unsigned tag_len = cc->on_disk_tag_size * bio_sectors(bio); + unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift); if (unlikely(tag_len > KMALLOC_MAX_SIZE) || - unlikely(!(io->integrity_metadata = kzalloc(tag_len, + unlikely(!(io->integrity_metadata = kmalloc(tag_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) { if (bio_sectors(bio) > cc->tag_pool_max_sectors) dm_accept_partial_bio(bio, cc->tag_pool_max_sectors); io->integrity_metadata = mempool_alloc(cc->tag_pool, GFP_NOIO); io->integrity_metadata_from_pool = true; - memset(io->integrity_metadata, 0, cc->tag_pool_max_sectors * (1 << SECTOR_SHIFT)); } } From 9119fedddb5b630eec3f3e39c4e382d9f60cca07 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 19 Apr 2017 10:52:54 -0400 Subject: [PATCH 24/50] dm: remove dummy dm_table definition This dummy structure definition was required for RCU macros, but it isn't required anymore, so delete it. The dummy definition confuses the crash tool, see: https://www.redhat.com/archives/dm-devel/2017-April/msg00197.html Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4da6fc6b1ffd..641d5e9cfedd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -37,14 +37,6 @@ struct hash_cell { struct dm_table *new_map; }; -/* - * A dummy definition to make RCU happy. - * struct dm_table should never be dereferenced in this file. - */ -struct dm_table { - int undefined__; -}; - struct vers_iter { size_t param_size; struct dm_target_versions *vers, *old_vers; From e36215d87f301f9567c8c99fd34e6c3521a94ddf Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 17 Apr 2017 11:05:03 -0700 Subject: [PATCH 25/50] dm ioctl: remove double parentheses The extra pair of parantheses is not needed and causes clang to generate warnings about the DM_DEV_CREATE_CMD comparison in validate_params(). Also remove another double parentheses that doesn't cause a warning. 
Signed-off-by: Matthias Kaehlcke Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 641d5e9cfedd..0956b8659360 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1770,12 +1770,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param) cmd == DM_LIST_VERSIONS_CMD) return 0; - if ((cmd == DM_DEV_CREATE_CMD)) { + if (cmd == DM_DEV_CREATE_CMD) { if (!*param->name) { DMWARN("name not supplied when creating device"); return -EINVAL; } - } else if ((*param->uuid && *param->name)) { + } else if (*param->uuid && *param->name) { DMWARN("only supply one of name or uuid, cmd(%u)", cmd); return -EINVAL; } From 7d1fedb6e96a960aa91e4ff70714c3fb09195a5a Mon Sep 17 00:00:00 2001 From: Vinothkumar Raja Date: Thu, 6 Apr 2017 22:09:38 -0400 Subject: [PATCH 26/50] dm btree: fix for dm_btree_find_lowest_key() dm_btree_find_lowest_key() is giving incorrect results. find_key() traverses the btree correctly for finding the highest key, but there is an error in the way it traverses the btree for retrieving the lowest key. dm_btree_find_lowest_key() fetches the first key of the rightmost block of the btree instead of fetching the first key from the leftmost block. Fix this by conditionally passing the correct parameter to value64() based on the @find_highest flag. Cc: stable@vger.kernel.org Signed-off-by: Erez Zadok Signed-off-by: Vinothkumar Raja Signed-off-by: Nidhi Panpalia Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-btree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 02e2ee0d8a00..f21ce6a3d4cf 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -902,8 +902,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, else *result_key = le64_to_cpu(ro_node(s)->keys[0]); - if (next_block || flags & INTERNAL_NODE) - block = value64(ro_node(s), i); + if (next_block || flags & INTERNAL_NODE) { + if (find_highest) + block = value64(ro_node(s), i); + else + block = value64(ro_node(s), 0); + } } while (flags & INTERNAL_NODE); From 948f581a53b704b984aa20df009f0a2b4cf7f907 Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Tue, 18 Apr 2017 15:27:06 +0800 Subject: [PATCH 27/50] dm thin: fix a memory leak when passing discard bio down dm-thin does not free the discard_parent bio after all chained sub bios finished. The following kmemleak report could be observed after pool with discard_passdown option processes discard bios in linux v4.11-rc7. To fix this, we drop the discard_parent bio reference when its endio (passdown_endio) called. unreferenced object 0xffff8803d6b29700 (size 256): comm "kworker/u8:0", pid 30349, jiffies 4379504020 (age 143002.776s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 01 00 00 00 00 00 00 f0 00 00 00 00 00 00 00 00 ................ 
backtrace: [] kmemleak_alloc+0x49/0xa0 [] kmem_cache_alloc+0xb4/0x100 [] mempool_alloc_slab+0x10/0x20 [] mempool_alloc+0x55/0x150 [] bio_alloc_bioset+0xb9/0x260 [] process_prepared_discard_passdown_pt1+0x40/0x1c0 [dm_thin_pool] [] break_up_discard_bio+0x1a9/0x200 [dm_thin_pool] [] process_discard_cell_passdown+0x24/0x40 [dm_thin_pool] [] process_discard_bio+0xdd/0xf0 [dm_thin_pool] [] do_worker+0xa76/0xd50 [dm_thin_pool] [] process_one_work+0x139/0x370 [] worker_thread+0x61/0x450 [] kthread+0xd6/0xf0 [] ret_from_fork+0x3f/0x70 [] 0xffffffffffffffff Cc: stable@vger.kernel.org Signed-off-by: Dennis Yang Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 9b3e2fcbfb1b..f90bcbf24ebc 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1069,6 +1069,7 @@ static void passdown_endio(struct bio *bio) * to unmap (we ignore err). */ queue_passdown_pt2(bio->bi_private); + bio_put(bio); } static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) From 117aceb030307dcd431fdcff87ce988d3016c34a Mon Sep 17 00:00:00 2001 From: Somasundaram Krishnasamy Date: Fri, 7 Apr 2017 12:14:55 -0700 Subject: [PATCH 28/50] dm era: save spacemap metadata root after the pre-commit When committing era metadata to disk, it doesn't always save the latest spacemap metadata root in superblock. Due to this, metadata is getting corrupted sometimes when reopening the device. The correct order of update should be, pre-commit (shadows spacemap root), save the spacemap root (newly shadowed block) to in-core superblock and then the final commit. Cc: stable@vger.kernel.org Signed-off-by: Somasundaram Krishnasamy Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 9fab33b113c4..68d4084377ad 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -961,18 +961,18 @@ static int metadata_commit(struct era_metadata *md) } } - r = save_sm_root(md); - if (r) { - DMERR("%s: save_sm_root failed", __func__); - return r; - } - r = dm_tm_pre_commit(md->tm); if (r) { DMERR("%s: pre commit failed", __func__); return r; } + r = save_sm_root(md); + if (r) { + DMERR("%s: save_sm_root failed", __func__); + return r; + } + r = superblock_lock(md, &sblock); if (r) { DMERR("%s: superblock lock failed", __func__); From 06eb061f48594aa369f6e852b352410298b317a8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 16:50:44 -0700 Subject: [PATCH 29/50] dm mpath: requeue after a small delay if blk_get_request() fails If blk_get_request() returns ENODEV then multipath_clone_and_map() causes a request to be requeued immediately. This can cause a kworker thread to spend 100% of the CPU time of a single core in __blk_mq_run_hw_queue() and also can cause device removal to never finish. Avoid this by only requeuing after a delay if blk_get_request() fails. Additionally, reduce the requeue delay. 
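As a hedged, condensed sketch of what the two requeue codes now mean to the request-based DM core (simplified from the dm-rq.c hunk below; the function name here is shortened and the tio->orig field is an assumption about struct dm_rq_target_io):

static void requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue)
{
	struct request *rq = tio->orig;

	/*
	 * DM_MAPIO_DELAY_REQUEUE -> delay_requeue == true  -> retry after 100 ms
	 * DM_MAPIO_REQUEUE       -> delay_requeue == false -> retry immediately
	 */
	dm_mq_delay_requeue_request(rq, delay_requeue ? 100 /*ms*/ : 0);
}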
Cc: stable@vger.kernel.org # 4.9+ Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 5 ++--- drivers/md/dm-rq.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7f223dbed49f..a4cc4d42117b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -484,7 +484,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request **__clone) { struct multipath *m = ti->private; - int r = DM_MAPIO_REQUEUE; size_t nr_bytes = blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; @@ -503,7 +502,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { pg_init_all_paths(m); - return r; + return DM_MAPIO_REQUEUE; } memset(mpio, 0, sizeof(*mpio)); @@ -517,7 +516,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, GFP_ATOMIC); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ - return r; + return DM_MAPIO_DELAY_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 28955b94d2b2..90756a56c4d3 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -280,7 +280,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ if (!rq->q->mq_ops) dm_old_requeue_request(rq); else - dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0); + dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0); rq_completed(md, rw, false); } From c82feeec9a014b72c4ffea36648cfb6f81cc1b73 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Mon, 24 Apr 2017 14:21:53 +0200 Subject: [PATCH 30/50] dm crypt: rewrite (wipe) key in crypto layer using random data The message "key wipe" used to wipe real key stored in crypto layer by rewriting it with zeroes. Since commit 28856a9 ("crypto: xts - consolidate sanity check for keys") this no longer works in FIPS mode for XTS. While running in FIPS mode the crypto key part has to differ from the tweak key. Fixes: 28856a9 ("crypto: xts - consolidate sanity check for keys") Cc: stable@vger.kernel.org Signed-off-by: Ondrej Kozina Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8bff6f7a4c6c..bb2e747907b0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2152,12 +2152,16 @@ out: static int crypt_wipe_key(struct crypt_config *cc) { + int r; + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); - memset(&cc->key, 0, cc->key_size * sizeof(u8)); + get_random_bytes(&cc->key, cc->key_size); kzfree(cc->key_string); cc->key_string = NULL; + r = crypt_setkey(cc); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypt_setkey(cc); + return r; } static void crypt_dtr(struct dm_target *ti) From a1b89132dc4f61071bdeaab92ea958e0953380a1 Mon Sep 17 00:00:00 2001 From: Tim Murray Date: Fri, 21 Apr 2017 11:11:36 +0200 Subject: [PATCH 31/50] dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues Running dm-crypt with workqueues at the standard priority results in IO competing for CPU time with standard user apps, which can lead to pipeline bubbles and seriously degraded performance. Move to using WQ_HIGHPRI workqueues to protect against that. 
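For context, a hedged annotation of the flag combination the patch switches to (the exact alloc_workqueue() calls are in the diff below; the flag semantics described here come from the generic workqueue API, not from dm-crypt itself):

/*
 * WQ_HIGHPRI       - work runs from the per-CPU high-priority worker pool,
 *                    so crypto work is not starved by normal-nice user tasks
 * WQ_CPU_INTENSIVE - long-running items do not count against concurrency
 *                    management for other per-CPU work
 * WQ_MEM_RECLAIM   - a rescuer thread guarantees forward progress under
 *                    memory pressure (dm-crypt sits in the I/O path)
 */
cc->io_queue = alloc_workqueue("kcryptd_io",
			       WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);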
Signed-off-by: Tim Murray Signed-off-by: Enric Balletbo i Serra Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bb2e747907b0..64c5022449ee 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2749,16 +2749,17 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); + cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); else - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; From d1ac3ff008fb9a48f91fc15920b4c8db24c0f03e Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Sun, 19 Feb 2017 14:46:07 +0200 Subject: [PATCH 32/50] dm verity: switch to using asynchronous hash crypto API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use of the synchronous digest API limits dm-verity to using pure CPU based algorithm providers and rules out the use of off CPU algorithm providers which are normally asynchronous by nature, potentially freeing CPU cycles. This can reduce performance per Watt in situations such as during boot time when a lot of concurrent file accesses are made to the protected volume. Signed-off-by: Gilad Ben-Yossef CC: Eric Biggers CC: Ondrej Mosnáček Tested-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-fec.c | 4 +- drivers/md/dm-verity-target.c | 207 +++++++++++++++++++++++----------- drivers/md/dm-verity.h | 23 ++-- 3 files changed, 160 insertions(+), 74 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0f0eb8a3d922..dab98fee0754 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -188,7 +188,7 @@ error: static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { - if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + if (unlikely(verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io)))) return 0; @@ -397,7 +397,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, } /* Always re-validate the corrected block against the expected hash */ - r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + r = verity_hash(v, verity_io_hash_req(v, io), fio->output, 1 << v->data_dev_block_bits, verity_io_real_digest(v, io)); if (unlikely(r < 0)) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7335d8a3fc47..97de961a3bfc 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -93,81 +93,123 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, } /* - * Wrapper for crypto_shash_init, which handles verity salting. 
+ * Callback function for asynchrnous crypto API completion notification */ -static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +static void verity_op_done(struct crypto_async_request *base, int err) +{ + struct verity_result *res = (struct verity_result *)base->data; + + if (err == -EINPROGRESS) + return; + + res->err = err; + complete(&res->completion); +} + +/* + * Wait for async crypto API callback + */ +static inline int verity_complete_op(struct verity_result *res, int ret) +{ + switch (ret) { + case 0: + break; + + case -EINPROGRESS: + case -EBUSY: + ret = wait_for_completion_interruptible(&res->completion); + if (!ret) + ret = res->err; + reinit_completion(&res->completion); + break; + + default: + DMERR("verity_wait_hash: crypto op submission failed: %d", ret); + } + + if (unlikely(ret < 0)) + DMERR("verity_wait_hash: crypto op failed: %d", ret); + + return ret; +} + +static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, + const u8 *data, size_t len, + struct verity_result *res) +{ + struct scatterlist sg; + + sg_init_one(&sg, data, len); + ahash_request_set_crypt(req, &sg, NULL, len); + + return verity_complete_op(res, crypto_ahash_update(req)); +} + +/* + * Wrapper for crypto_ahash_init, which handles verity salting. + */ +static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, + struct verity_result *res) { int r; - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ahash_request_set_tfm(req, v->tfm); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + verity_op_done, (void *)res); + init_completion(&res->completion); - r = crypto_shash_init(desc); + r = verity_complete_op(res, crypto_ahash_init(req)); if (unlikely(r < 0)) { - DMERR("crypto_shash_init failed: %d", r); + DMERR("crypto_ahash_init failed: %d", r); return r; } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - - if (unlikely(r < 0)) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - return 0; -} - -static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, - const u8 *data, size_t len) -{ - int r = crypto_shash_update(desc, data, len); - - if (unlikely(r < 0)) - DMERR("crypto_shash_update failed: %d", r); + if (likely(v->version >= 1)) + r = verity_hash_update(v, req, v->salt, v->salt_size, res); return r; } -static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, - u8 *digest) +static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, + u8 *digest, struct verity_result *res) { int r; if (unlikely(!v->version)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); + r = verity_hash_update(v, req, v->salt, v->salt_size, res); if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; + DMERR("verity_hash_final failed updating salt: %d", r); + goto out; } } - r = crypto_shash_final(desc, digest); - - if (unlikely(r < 0)) - DMERR("crypto_shash_final failed: %d", r); - + ahash_request_set_crypt(req, NULL, digest, 0); + r = verity_complete_op(res, crypto_ahash_final(req)); +out: return r; } -int verity_hash(struct dm_verity *v, struct shash_desc *desc, +int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest) { int r; + struct verity_result res; - r = verity_hash_init(v, desc); + r = verity_hash_init(v, req, &res); if (unlikely(r < 0)) - return r; + goto out; - r = verity_hash_update(v, desc, data, len); + r = 
verity_hash_update(v, req, data, len, &res); if (unlikely(r < 0)) - return r; + goto out; - return verity_hash_final(v, desc, digest); + r = verity_hash_final(v, req, digest, &res); + +out: + return r; } static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, @@ -275,7 +317,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, verity_io_hash_desc(v, io), + r = verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->hash_dev_block_bits, verity_io_real_digest(v, io)); if (unlikely(r < 0)) @@ -343,6 +385,49 @@ out: return r; } +/* + * Calculates the digest for the given bio + */ +int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, struct verity_result *res) +{ + unsigned int todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + struct scatterlist sg; + struct ahash_request *req = verity_io_hash_req(v, io); + + do { + int r; + unsigned int len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + sg_init_table(&sg, 1); + + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + /* + * Operating on a single page at a time looks suboptimal + * until you consider the typical block size is 4,096B. + * Going through this loops twice should be very rare. + */ + sg_set_page(&sg, bv.bv_page, len, bv.bv_offset); + ahash_request_set_crypt(req, &sg, NULL, len); + r = verity_complete_op(res, crypto_ahash_update(req)); + + if (unlikely(r < 0)) { + DMERR("verity_for_io_block crypto op failed: %d", r); + return r; + } + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + /* * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec * starting from iter. 
@@ -381,12 +466,6 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } -static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, - u8 *data, size_t len) -{ - return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); -} - static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len) { @@ -403,10 +482,11 @@ static int verity_verify_io(struct dm_verity_io *io) struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; + struct verity_result res; for (b = 0; b < io->n_blocks; b++) { int r; - struct shash_desc *desc = verity_io_hash_desc(v, io); + struct ahash_request *req = verity_io_hash_req(v, io); r = verity_hash_for_block(v, io, io->block + b, verity_io_want_digest(v, io), @@ -427,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, desc); + r = verity_hash_init(v, req, &res); if (unlikely(r < 0)) return r; start = io->iter; - r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + r = verity_for_io_block(v, io, &io->iter, &res); if (unlikely(r < 0)) return r; - r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); + r = verity_hash_final(v, req, verity_io_real_digest(v, io), + &res); if (unlikely(r < 0)) return r; @@ -705,7 +786,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->zero_digest); if (v->tfm) - crypto_free_shash(v->tfm); + crypto_free_ahash(v->tfm); kfree(v->alg_name); @@ -723,7 +804,7 @@ static void verity_dtr(struct dm_target *ti) static int verity_alloc_zero_digest(struct dm_verity *v) { int r = -ENOMEM; - struct shash_desc *desc; + struct ahash_request *req; u8 *zero_data; v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); @@ -731,9 +812,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - desc = kmalloc(v->shash_descsize, GFP_KERNEL); + req = kmalloc(v->ahash_reqsize, GFP_KERNEL); - if (!desc) + if (!req) return r; /* verity_dtr will free zero_digest */ zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); @@ -741,11 +822,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!zero_data) goto out; - r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, v->zero_digest); out: - kfree(desc); + kfree(req); kfree(zero_data); return r; @@ -923,21 +1004,21 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); + v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0); if (IS_ERR(v->tfm)) { ti->error = "Cannot initialize hash function"; r = PTR_ERR(v->tfm); v->tfm = NULL; goto bad; } - v->digest_size = crypto_shash_digestsize(v->tfm); + v->digest_size = crypto_ahash_digestsize(v->tfm); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; r = -EINVAL; goto bad; } - v->shash_descsize = - sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); + v->ahash_reqsize = sizeof(struct ahash_request) + + crypto_ahash_reqsize(v->tfm); v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->root_digest) { @@ -1037,7 +1118,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->per_io_data_size = sizeof(struct dm_verity_io) + - v->shash_descsize + v->digest_size * 2; + v->ahash_reqsize + v->digest_size * 2; r = verity_fec_ctr(v); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..a59e0ada6fd3 
100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -37,7 +37,7 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_shash *tfm; + struct crypto_ahash *tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ u8 *zero_digest; /* digest for a zero block */ @@ -52,7 +52,7 @@ struct dm_verity { unsigned char levels; /* the number of tree levels */ unsigned char version; unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ + unsigned int ahash_reqsize;/* the size of temporary space for crypto */ int hash_failed; /* set to 1 if hash of any block failed */ enum verity_mode mode; /* mode for handling verification errors */ unsigned corrupted_errs;/* Number of errors for corrupted blocks */ @@ -81,31 +81,36 @@ struct dm_verity_io { /* * Three variably-size fields follow this struct: * - * u8 hash_desc[v->shash_descsize]; + * u8 hash_req[v->ahash_reqsize]; * u8 real_digest[v->digest_size]; * u8 want_digest[v->digest_size]; * - * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * To access them use: verity_io_hash_req(), verity_io_real_digest() * and verity_io_want_digest(). */ }; -static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, +struct verity_result { + struct completion completion; + int err; +}; + +static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v, struct dm_verity_io *io) { - return (struct shash_desc *)(io + 1); + return (struct ahash_request *)(io + 1); } static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->shash_descsize; + return (u8 *)(io + 1) + v->ahash_reqsize; } static inline u8 *verity_io_want_digest(struct dm_verity *v, struct dm_verity_io *io) { - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; + return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; } static inline u8 *verity_io_digest_end(struct dm_verity *v, @@ -120,7 +125,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_io *io, u8 *data, size_t len)); -extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, +extern int verity_hash(struct dm_verity *v, struct ahash_request *req, const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, From 86f917adea2dc0ef440a248601e48154dc48e6d0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 30 Mar 2017 22:18:48 -0700 Subject: [PATCH 33/50] dm crypt: remove obsolete references to per-CPU state dm-crypt used to use separate crypto transforms for each CPU, but this is no longer the case. To avoid confusion, fix up obsolete comments and rename setup_essiv_cpu(). Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 64c5022449ee..e2de5715d528 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -225,7 +225,7 @@ static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, struct scatterlist *sg); /* - * Use this to access cipher attributes that are the same for each CPU. + * Use this to access cipher attributes that are independent of the key. 
*/ static struct crypto_skcipher *any_tfm(struct crypt_config *cc) { @@ -349,10 +349,11 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) return err; } -/* Set up per cpu cipher state */ -static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, - struct dm_target *ti, - u8 *salt, unsigned saltsize) +/* Allocate the cipher for ESSIV */ +static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, + struct dm_target *ti, + const u8 *salt, + unsigned int saltsize) { struct crypto_cipher *essiv_tfm; int err; @@ -431,9 +432,8 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, cc->iv_gen_private.essiv.salt = salt; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; - essiv_tfm = setup_essiv_cpu(cc, ti, salt, - crypto_ahash_digestsize(hash_tfm)); - + essiv_tfm = alloc_essiv_cipher(cc, ti, salt, + crypto_ahash_digestsize(hash_tfm)); if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); From e944e03e336f7ffa02aabc71291933d93dcd077c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 27 Apr 2017 16:52:04 +0300 Subject: [PATCH 34/50] dm crypt: replace custom implementation of hex2bin() There is no need to have a duplication of the generic library, i.e. hex2bin(). Replace the open coded variant. Signed-off-by: Andy Shevchenko Tested-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e2de5715d528..24f3b9fdeeb6 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1814,30 +1814,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) queue_work(cc->crypt_queue, &io->work); } -/* - * Decode key from its hex representation - */ -static int crypt_decode_key(u8 *key, char *hex, unsigned int size) -{ - char buffer[3]; - unsigned int i; - - buffer[2] = '\0'; - - for (i = 0; i < size; i++) { - buffer[0] = *hex++; - buffer[1] = *hex++; - - if (kstrtou8(buffer, 16, &key[i])) - return -EINVAL; - } - - if (*hex != '\0') - return -EINVAL; - - return 0; -} - static void crypt_free_tfms_aead(struct crypt_config *cc) { if (!cc->cipher_tfm.tfms_aead) @@ -2136,7 +2112,8 @@ static int crypt_set_key(struct crypt_config *cc, char *key) kzfree(cc->key_string); cc->key_string = NULL; - if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) + /* Decode key from its hex representation. 
*/ + if (cc->key_size && hex2bin(cc->key, key, cc->key_size) < 0) goto out; r = crypt_setkey(cc); From 6625d903253eb6f003849823e22d7b8de5bfb5b2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 27 Apr 2017 11:49:33 -0400 Subject: [PATCH 35/50] dm integrity: use hex2bin instead of open-coded variant Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 0354af4cd713..023d3f8a51cc 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2496,8 +2496,6 @@ static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, ch k = strchr(a->alg_string, ':'); if (k) { - unsigned i; - *k = 0; a->key_string = k + 1; if (strlen(a->key_string) & 1) @@ -2507,16 +2505,8 @@ static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, ch a->key = kmalloc(a->key_size, GFP_KERNEL); if (!a->key) goto nomem; - for (i = 0; i < a->key_size; i++) { - char digit[3]; - digit[0] = a->key_string[i * 2]; - digit[1] = a->key_string[i * 2 + 1]; - digit[2] = 0; - if (strspn(digit, "0123456789abcdefABCDEF") != 2) - goto inval; - if (kstrtou8(digit, 16, &a->key[i])) - goto inval; - } + if (hex2bin(a->key, a->key_string, a->key_size)) + goto inval; } return 0; From 84ff1bcc2e25f1ddf5b350c4fa718ca01fdd88e9 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Apr 2017 18:39:47 -0400 Subject: [PATCH 36/50] dm integrity: use previously calculated log2 of sectors_per_block The log2 of sectors_per_block was already calculated, so we don't have to use the ilog2 function. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 023d3f8a51cc..c7f7c8d76576 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -2369,7 +2368,7 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) bi.profile = &dm_integrity_profile; bi.tuple_size = ic->tag_size; bi.tag_size = bi.tuple_size; - bi.interval_exp = ilog2(ic->sectors_per_block << SECTOR_SHIFT); + bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; blk_integrity_register(disk, &bi); blk_queue_max_integrity_segments(disk->queue, UINT_MAX); From 4617f564c06117c7d1b611be49521a4430042287 Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Thu, 27 Apr 2017 10:32:55 -0700 Subject: [PATCH 37/50] dm ioctl: prevent stack leak in dm ioctl call When calling a dm ioctl that doesn't process any data (IOCTL_FLAGS_NO_PARAMS), the contents of the data field in struct dm_ioctl are left initialized. Current code is incorrectly extending the size of data copied back to user, causing the contents of kernel stack to be leaked to user. Fix by only copying contents before data and allow the functions processing the ioctl to override. 
Cc: stable@vger.kernel.org Signed-off-by: Adrian Salido Reviewed-by: Alasdair G Kergon Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0956b8659360..ddda8107aa7e 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1840,7 +1840,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (r) goto out; - param->data_size = sizeof(*param); + param->data_size = offsetof(struct dm_ioctl, data); r = fn(param, input_param_size); if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && From 89bfce763e43fa4897e0d3af6b29ed909df64cfd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:14 -0700 Subject: [PATCH 38/50] dm mpath: split and rename activate_path() to prepare for its expanded use activate_path() is renamed to activate_path_work() which now calls activate_or_offline_path(). activate_or_offline_path() will be used by the next commit. Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index a4cc4d42117b..4b891d9ff349 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -111,7 +111,8 @@ typedef int (*action_fn) (struct pgpath *pgpath); static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); -static void activate_path(struct work_struct *work); +static void activate_or_offline_path(struct pgpath *pgpath); +static void activate_path_work(struct work_struct *work); static void process_queued_bios(struct work_struct *work); /*----------------------------------------------- @@ -136,7 +137,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = true; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); } return pgpath; @@ -1436,10 +1437,8 @@ out: spin_unlock_irqrestore(&m->lock, flags); } -static void activate_path(struct work_struct *work) +static void activate_or_offline_path(struct pgpath *pgpath) { - struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path.work); struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); if (pgpath->is_active && !blk_queue_dying(q)) @@ -1448,6 +1447,14 @@ static void activate_path(struct work_struct *work) pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); } +static void activate_path_work(struct work_struct *work) +{ + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path.work); + + activate_or_offline_path(pgpath); +} + static int noretry_error(int error) { switch (error) { From 7083abbbfc4fa706ff72d27d33a5214881979336 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:15 -0700 Subject: [PATCH 39/50] dm mpath: avoid that path removal can trigger an infinite loop If blk_get_request() fails, check whether the failure is due to a path being removed. If that is the case, fail the path by triggering a call to fail_path(). This avoids that the following scenario can be encountered while removing paths: * CPU usage of a kworker thread jumps to 100%. * Removing the DM device becomes impossible. Delay requeueing if blk_get_request() returns -EBUSY or -EWOULDBLOCK, and the queue is not dying, because in these cases immediate requeuing is inappropriate. 
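A hedged, condensed sketch of the resulting decision flow (the complete hunk, including the DMERR_LIMIT message and the pg_init_in_progress accounting, is in the diff below):

clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
if (IS_ERR(clone)) {
	if (blk_queue_dying(q)) {
		/* the path itself is going away: fail it through the
		 * pg_init machinery and requeue so another path is picked */
		activate_or_offline_path(pgpath);
		return DM_MAPIO_REQUEUE;
	}
	/* transient -EBUSY/-EWOULDBLOCK: back off before retrying */
	return DM_MAPIO_DELAY_REQUEUE;
}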
Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 4b891d9ff349..f3c79f188747 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -489,6 +489,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct pgpath *pgpath; struct block_device *bdev; struct dm_mpath_io *mpio = get_mpio(map_context); + struct request_queue *q; struct request *clone; /* Do we need to select a new pgpath? */ @@ -511,12 +512,18 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, mpio->nr_bytes = nr_bytes; bdev = pgpath->path.dev->bdev; - - clone = blk_get_request(bdev_get_queue(bdev), - rq->cmd_flags | REQ_NOMERGE, - GFP_ATOMIC); + q = bdev_get_queue(bdev); + clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ + bool queue_dying = blk_queue_dying(q); + DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing", + PTR_ERR(clone), queue_dying ? " (path offline)" : ""); + if (queue_dying) { + atomic_inc(&m->pg_init_in_progress); + activate_or_offline_path(pgpath); + return DM_MAPIO_REQUEUE; + } return DM_MAPIO_DELAY_REQUEUE; } clone->bio = clone->biotail = NULL; From c1d7ecf7ca11d0edd3085262c8597203440d056c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:16 -0700 Subject: [PATCH 40/50] dm mpath: delay requeuing while path initialization is in progress Requeuing a request immediately while path initialization is ongoing causes high CPU usage, something that is undesired. Hence delay requeuing while path initialization is in progress. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f3c79f188747..d85baffa3377 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -322,13 +322,16 @@ static int __pg_init_all_paths(struct multipath *m) return atomic_read(&m->pg_init_in_progress); } -static void pg_init_all_paths(struct multipath *m) +static int pg_init_all_paths(struct multipath *m) { + int ret; unsigned long flags; spin_lock_irqsave(&m->lock, flags); - __pg_init_all_paths(m); + ret = __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); + + return ret; } static void __switch_pg(struct multipath *m, struct priority_group *pg) @@ -503,7 +506,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return -EIO; /* Failed */ } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { - pg_init_all_paths(m); + if (pg_init_all_paths(m)) + return DM_MAPIO_DELAY_REQUEUE; return DM_MAPIO_REQUEUE; } From 23a601248958fa4142d49294352fe8d1fdf3e509 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:19 -0700 Subject: [PATCH 41/50] dm rq: check blk_mq_register_dev() return value in dm_mq_init_request_queue() Otherwise the request-based DM blk-mq request_queue will be put into service without being properly exported via sysfs. 
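In outline, the fix follows the usual reverse-order unwind idiom; a hedged sketch (the body of out_kfree_tag_set is an assumption, as it is not shown in the hunk below):

	err = blk_mq_register_dev(disk_to_dev(md->disk), q);
	if (err)
		goto out_cleanup_queue;
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out_tag_set:
	blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
	kfree(md->tag_set);		/* assumed: frees the tag set allocated earlier */
	return err;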
Cc: stable@vger.kernel.org Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 90756a56c4d3..a6e8da9da7a4 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -809,10 +809,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_dev(disk_to_dev(md->disk), q); + err = blk_mq_register_dev(disk_to_dev(md->disk), q); + if (err) + goto out_cleanup_queue; return 0; +out_cleanup_queue: + blk_cleanup_queue(q); out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: From 73cbca6a637eb88738ea5a5cd6a611bbbca8ac19 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:20 -0700 Subject: [PATCH 42/50] dm block manager: remove an unused argument from dm_block_manager_create() The 'cache_size' argument of dm_block_manager_create() has never been used. Remove it along with the definitions of the constants passed as the 'cache_size' argument. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 3 --- drivers/md/dm-era-target.c | 2 -- drivers/md/dm-thin-metadata.c | 2 -- drivers/md/persistent-data/dm-block-manager.c | 1 - drivers/md/persistent-data/dm-block-manager.h | 2 +- 5 files changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index e4c2c1a1e993..5a026dc24db6 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -27,8 +27,6 @@ #define MIN_CACHE_VERSION 1 #define MAX_CACHE_VERSION 2 -#define CACHE_METADATA_CACHE_SIZE 64 - /* * 3 for btree insert + * 2 for btree lookup used within space map @@ -535,7 +533,6 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, { int r; cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, - CACHE_METADATA_CACHE_SIZE, CACHE_MAX_CONCURRENT_LOCKS); if (IS_ERR(cmd->bm)) { DMERR("could not create block manager"); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 68d4084377ad..e7ba89f98d8d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -254,7 +254,6 @@ static struct dm_block_validator sb_validator = { * Low level metadata handling *--------------------------------------------------------------*/ #define DM_ERA_METADATA_BLOCK_SIZE 4096 -#define DM_ERA_METADATA_CACHE_SIZE 64 #define ERA_MAX_CONCURRENT_LOCKS 5 struct era_metadata { @@ -615,7 +614,6 @@ static int create_persistent_data_objects(struct era_metadata *md, int r; md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, - DM_ERA_METADATA_CACHE_SIZE, ERA_MAX_CONCURRENT_LOCKS); if (IS_ERR(md->bm)) { DMERR("could not create block manager"); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index a15091a0d40c..0f0251d0d337 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -77,7 +77,6 @@ #define THIN_SUPERBLOCK_MAGIC 27022010 #define THIN_SUPERBLOCK_LOCATION 0 #define THIN_VERSION 2 -#define THIN_METADATA_CACHE_SIZE 64 #define SECTOR_TO_BLOCK_SHIFT 3 /* @@ -686,7 +685,6 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f int r; pmd->bm = dm_block_manager_create(pmd->bdev, 
THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, - THIN_METADATA_CACHE_SIZE, THIN_MAX_CONCURRENT_LOCKS); if (IS_ERR(pmd->bm)) { DMERR("could not create block manager"); diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 8589e0a14068..ea15d220ced7 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -378,7 +378,6 @@ struct dm_block_manager { struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, unsigned block_size, - unsigned cache_size, unsigned max_held_per_thread) { int r; diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 3627d1b7667a..e728937f376a 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -33,7 +33,7 @@ void *dm_block_data(struct dm_block *b); struct dm_block_manager; struct dm_block_manager *dm_block_manager_create( struct block_device *bdev, unsigned block_size, - unsigned cache_size, unsigned max_held_per_thread); + unsigned max_held_per_thread); void dm_block_manager_destroy(struct dm_block_manager *bm); unsigned dm_bm_block_size(struct dm_block_manager *bm); From 1ea0654e46eb62acc379000be2f16350101ebf85 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:21 -0700 Subject: [PATCH 43/50] dm: verify suspend_locking assumptions at runtime Ensure that the assumptions about the caller holding suspend_lock are checked at runtime if lockdep is enabled. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 4 ++++ drivers/md/dm.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index c68757f94e16..515136f29115 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1680,6 +1680,8 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode) int i = t->num_targets; struct dm_target *ti = t->targets; + lockdep_assert_held(&t->md->suspend_lock); + while (i--) { switch (mode) { case PRESUSPEND: @@ -1727,6 +1729,8 @@ int dm_table_resume_targets(struct dm_table *t) { int i, r = 0; + lockdep_assert_held(&t->md->suspend_lock); + for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = t->targets + i; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e602ae0d5d75..9940c9a42665 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1698,6 +1698,8 @@ static void event_callback(void *context) */ static void __set_size(struct mapped_device *md, sector_t size) { + lockdep_assert_held(&md->suspend_lock); + set_capacity(md->disk, size); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); @@ -2147,8 +2149,6 @@ static void unlock_fs(struct mapped_device *md) * If __dm_suspend returns 0, the device is completely quiescent * now. There is no request-processing activity. All new requests * are being added to md->deferred list. 
- * - * Caller must hold md->suspend_lock */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, unsigned suspend_flags, long task_state, @@ -2364,6 +2364,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla { struct dm_table *map = NULL; + lockdep_assert_held(&md->suspend_lock); + if (md->internal_suspend_count++) return; /* nested internal suspend */ From b194679fac3067e785f432109c69759aa75b038c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:22 -0700 Subject: [PATCH 44/50] dm mpath: verify __pg_init_all_paths locking assumptions at runtime Verify at runtime that __pg_init_all_paths() is called with multipath.lock held if lockdep is enabled. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d85baffa3377..730ec0be1afa 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -298,6 +298,8 @@ static int __pg_init_all_paths(struct multipath *m) struct pgpath *pgpath; unsigned long pg_init_delay = 0; + lockdep_assert_held(&m->lock); + if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) return 0; From 7e0d574f2683a2346c978613a72ff07afc89b17a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:23 -0700 Subject: [PATCH 45/50] dm: introduce enum dm_queue_mode to cleanup related code Introduce an enumeration type for the queue mode. This patch does not change any functionality but makes the DM code easier to read. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 2 +- drivers/md/dm-ioctl.c | 2 +- drivers/md/dm-mpath.c | 5 ++++- drivers/md/dm-table.c | 14 +++++++------- drivers/md/dm.c | 11 +++++++---- drivers/md/dm.h | 8 ++++---- include/linux/device-mapper.h | 14 ++++++++------ 7 files changed, 32 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 136fda3ff9e5..b92f74d9a982 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -47,7 +47,7 @@ struct mapped_device { struct request_queue *queue; int numa_node_id; - unsigned type; + enum dm_queue_mode type; /* Protect queue and type against concurrent access. 
*/ struct mutex type_lock; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index ddda8107aa7e..2d5d7064acbf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1260,7 +1260,7 @@ static int populate_table(struct dm_table *table, return dm_table_complete(table); } -static bool is_valid_type(unsigned cur, unsigned new) +static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new) { if (cur == new || (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 730ec0be1afa..8336332bd61f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -90,7 +90,7 @@ struct multipath { atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ - unsigned queue_mode; + enum dm_queue_mode queue_mode; struct mutex work_mutex; struct work_struct trigger_event; @@ -1700,6 +1700,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_MQ_REQUEST_BASED: DMEMIT("queue_mode mq "); break; + default: + WARN_ON_ONCE(true); + break; } } } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 515136f29115..a02a04829156 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -30,7 +30,7 @@ struct dm_table { struct mapped_device *md; - unsigned type; + enum dm_queue_mode type; /* btree table */ unsigned int depth; @@ -825,19 +825,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args) } EXPORT_SYMBOL(dm_consume_args); -static bool __table_type_bio_based(unsigned table_type) +static bool __table_type_bio_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_BIO_BASED || table_type == DM_TYPE_DAX_BIO_BASED); } -static bool __table_type_request_based(unsigned table_type) +static bool __table_type_request_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_REQUEST_BASED || table_type == DM_TYPE_MQ_REQUEST_BASED); } -void dm_table_set_type(struct dm_table *t, unsigned type) +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) { t->type = type; } @@ -879,7 +879,7 @@ static int dm_table_determine_type(struct dm_table *t) struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); - unsigned live_md_type = dm_get_md_type(t->md); + enum dm_queue_mode live_md_type = dm_get_md_type(t->md); if (t->type != DM_TYPE_NONE) { /* target already set the table's type */ @@ -988,7 +988,7 @@ verify_rq_based: return 0; } -unsigned dm_table_get_type(struct dm_table *t) +enum dm_queue_mode dm_table_get_type(struct dm_table *t) { return t->type; } @@ -1039,7 +1039,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t) static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { - unsigned type = dm_table_get_type(t); + enum dm_queue_mode type = dm_table_get_type(t); unsigned per_io_data_size = 0; struct dm_target *tgt; unsigned i; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9940c9a42665..45660246e8f5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1807,13 +1807,13 @@ void dm_unlock_md_type(struct mapped_device *md) mutex_unlock(&md->type_lock); } -void dm_set_md_type(struct mapped_device *md, unsigned type) +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) { BUG_ON(!mutex_is_locked(&md->type_lock)); md->type = type; } -unsigned dm_get_md_type(struct mapped_device *md) +enum dm_queue_mode dm_get_md_type(struct mapped_device *md) { return 
md->type; } @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; - unsigned type = dm_get_md_type(md); + enum dm_queue_mode type = dm_get_md_type(md); switch (type) { case DM_TYPE_REQUEST_BASED: @@ -1871,6 +1871,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) if (type == DM_TYPE_DAX_BIO_BASED) queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); break; + case DM_TYPE_NONE: + WARN_ON_ONCE(true); + break; } return 0; @@ -2556,7 +2559,7 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, unsigned integrity, unsigned per_io_data_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f298b01f7ab3..38c84c0a35d4 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -unsigned dm_table_get_type(struct dm_table *t); +enum dm_queue_mode dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); @@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); -void dm_set_md_type(struct mapped_device *md, unsigned type); -unsigned dm_get_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type); +enum dm_queue_mode dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); @@ -204,7 +204,7 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, unsigned integrity, unsigned per_bio_data_size); void dm_free_md_mempools(struct dm_md_mempools *pools); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 98f981026e4e..1ce4036224eb 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -22,11 +22,13 @@ struct bio_vec; /* * Type of table, mapped_device's mempool and request_queue */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 -#define DM_TYPE_MQ_REQUEST_BASED 3 -#define DM_TYPE_DAX_BIO_BASED 4 +enum dm_queue_mode { + DM_TYPE_NONE = 0, + DM_TYPE_BIO_BASED = 1, + DM_TYPE_REQUEST_BASED = 2, + DM_TYPE_MQ_REQUEST_BASED = 3, + DM_TYPE_DAX_BIO_BASED = 4, +}; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; @@ -476,7 +478,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback * Useful for "hybrid" target (supports both bio-based * and request-based). 
*/ -void dm_table_set_type(struct dm_table *t, unsigned type); +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type); /* * Finally call this to make the table ready for use. From ca5beb76c32af33e5be4e01c85c8bd5a067c4543 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:24 -0700 Subject: [PATCH 46/50] dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH Instead of checking MPATHF_QUEUE_IF_NO_PATH, MPATHF_SAVED_QUEUE_IF_NO_PATH and the no_flush flag to decide whether or not to push back a request (or bio) if there are no paths available, only clear MPATHF_QUEUE_IF_NO_PATH in queue_if_no_path() if no_flush has not been set. The result is that only a single bit has to be tested in the hot path to decide whether or not a request must be pushed back and also that m->lock does not have to be taken in the hot path. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 70 +++++++------------------------------------ 1 file changed, 11 insertions(+), 59 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8336332bd61f..5cb1beccd1e2 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -441,47 +441,6 @@ failed: return NULL; } -/* - * Check whether bios must be queued in the device-mapper core rather - * than here in the target. - * - * If m->queue_if_no_path and m->saved_queue_if_no_path hold the - * same value then we are not between multipath_presuspend() - * and multipath_resume() calls and we have no need to check - * for the DMF_NOFLUSH_SUSPENDING flag. - */ -static bool __must_push_back(struct multipath *m) -{ - return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && - dm_noflush_suspending(m->ti)); -} - -static bool must_push_back_rq(struct multipath *m) -{ - bool r; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || - __must_push_back(m)); - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} - -static bool must_push_back_bio(struct multipath *m) -{ - bool r; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - r = __must_push_back(m); - spin_unlock_irqrestore(&m->lock, flags); - - return r; -} - /* * Map cloned requests (request-based multipath) */ @@ -503,7 +462,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (must_push_back_rq(m)) + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_DELAY_REQUEUE; return -EIO; /* Failed */ } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || @@ -580,9 +539,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m } if (!pgpath) { - if (!must_push_back_bio(m)) - return -EIO; - return DM_MAPIO_REQUEUE; + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + return DM_MAPIO_REQUEUE; + return -EIO; } mpio->pgpath = pgpath; @@ -674,7 +633,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, else clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); } - if (queue_if_no_path) + if (queue_if_no_path || dm_noflush_suspending(m->ti)) set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); else clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); @@ -1519,12 +1478,9 @@ static int do_end_io(struct multipath *m, struct request *clone, if (mpio->pgpath) fail_path(mpio->pgpath); - if (!atomic_read(&m->nr_valid_paths)) { - if 
(!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (!must_push_back_rq(m)) - r = -EIO; - } - } + if (atomic_read(&m->nr_valid_paths) == 0 && + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + r = -EIO; return r; } @@ -1565,13 +1521,9 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, if (mpio->pgpath) fail_path(mpio->pgpath); - if (!atomic_read(&m->nr_valid_paths)) { - if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (!must_push_back_bio(m)) - return -EIO; - return DM_ENDIO_REQUEUE; - } - } + if (atomic_read(&m->nr_valid_paths) == 0 && + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + return -EIO; /* Queue for the daemon to resubmit */ dm_bio_restore(get_bio_details_from_bio(clone), clone); From 9a8ac3ae682e8760afebab71556a9a8d1b18f906 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:25 -0700 Subject: [PATCH 47/50] dm mpath: cleanup QUEUE_IF_NO_PATH bit manipulation by introducing assign_bit() No functional change but makes the code easier to read. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 5cb1beccd1e2..cc529537c8ce 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -612,6 +612,14 @@ static void process_queued_bios(struct work_struct *work) blk_finish_plug(&plug); } +static void assign_bit(bool value, long nr, unsigned long *addr) +{ + if (value) + set_bit(nr, addr); + else + clear_bit(nr, addr); +} + /* * If we run out of usable paths, should we queue I/O or error it? */ @@ -621,23 +629,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, unsigned long flags; spin_lock_irqsave(&m->lock, flags); - - if (save_old_value) { - if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - } else { - if (queue_if_no_path) - set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); - } - if (queue_if_no_path || dm_noflush_suspending(m->ti)) - set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - + assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) || + (!save_old_value && queue_if_no_path), + MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti), + MPATHF_QUEUE_IF_NO_PATH, &m->flags); spin_unlock_irqrestore(&m->lock, flags); if (!queue_if_no_path) { @@ -1589,10 +1585,8 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) - set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - else - clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags), + MPATHF_QUEUE_IF_NO_PATH, &m->flags); spin_unlock_irqrestore(&m->lock, flags); } From 86331f39a5935b092d3ea59446d416563ed05d16 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:26 -0700 Subject: [PATCH 48/50] dm mpath: make it easier to detect unintended I/O request flushes I/O errors triggered by multipathd incorrectly not enabling the no-flush flag for DM_DEVICE_SUSPEND or DM_DEVICE_RESUME are hard to debug. Add more logging to make it easier to debug this. 
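The new dm_report_EIO() helper in the diff below is a macro built on GCC's statement-expression extension; a hedged toy example of that idiom (hypothetical macro, not part of the patch):

/* A ({ ... }) block is an expression whose value is its last statement, and
 * because the macro expands at the call site, pr_debug() attributes the
 * message to the calling function and line rather than to a helper. */
#define report_and_fail(name)				\
({							\
	pr_debug("%s: failing with -EIO\n", (name));	\
	-EIO;						\
})

/* usage: return report_and_fail(dm_device_name(md)); */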
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 25 +++++++++++++++++++++---- drivers/md/dm.c | 2 ++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index cc529537c8ce..fd7cdc4ce2a5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -441,6 +441,23 @@ failed: return NULL; } +/* + * dm_report_EIO() is a macro instead of a function to make pr_debug() + * report the function name and line number of the function from which + * it has been invoked. + */ +#define dm_report_EIO(m) \ +({ \ + struct mapped_device *md = dm_table_get_md((m)->ti->table); \ + \ + pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ + dm_device_name(md), \ + test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ + dm_noflush_suspending((m)->ti)); \ + -EIO; \ +}) + /* * Map cloned requests (request-based multipath) */ @@ -464,7 +481,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (!pgpath) { if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_DELAY_REQUEUE; - return -EIO; /* Failed */ + return dm_report_EIO(m); /* Failed */ } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { if (pg_init_all_paths(m)) @@ -541,7 +558,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (!pgpath) { if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_REQUEUE; - return -EIO; + return dm_report_EIO(m); } mpio->pgpath = pgpath; @@ -1476,7 +1493,7 @@ static int do_end_io(struct multipath *m, struct request *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - r = -EIO; + r = dm_report_EIO(m); return r; } @@ -1519,7 +1536,7 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return -EIO; + return dm_report_EIO(m); /* Queue for the daemon to resubmit */ dm_bio_restore(get_bio_details_from_bio(clone), clone); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 45660246e8f5..dbfaf6dde657 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2169,6 +2169,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, */ if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + else + pr_debug("%s: suspending with flush\n", dm_device_name(md)); /* * This gets reverted if there's an error later and the targets From 1b0fb5a5b2dc0dddcfa575060441a7176ba7ac37 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 30 Apr 2017 17:33:26 -0400 Subject: [PATCH 49/50] dm bufio: avoid a possible ABBA deadlock __get_memory_limit() tests if dm_bufio_cache_size changed and calls __cache_size_refresh() if it did. It takes dm_bufio_clients_lock while it already holds the client lock. However, lock ordering is violated because in cleanup_old_buffers() dm_bufio_clients_lock is taken before the client lock. This results in a possible deadlock and lockdep engine warning. Fix this deadlock by changing mutex_lock() to mutex_trylock(). If the lock can't be taken, it will be re-checked next time when a new buffer is allocated. Also add "unlikely" to the if condition, so that the optimizer assumes that the condition is false. 
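The fix is an instance of a general pattern: when a path that already holds lock B would need lock A in the opposite order from everyone else, it takes A with trylock and skips the optional work if A is contended, so the A-then-B and B-then-A orders can never deadlock each other. A minimal standalone pthread sketch of that idea (not the dm-bufio code; the lock names, cache_size variables and helpers below are illustrative stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t clients_lock = PTHREAD_MUTEX_INITIALIZER;	/* lock "A" */
static pthread_mutex_t client_lock  = PTHREAD_MUTEX_INITIALIZER;	/* lock "B" */
static long cache_size = 100, cache_size_latch = 50;

static void cache_size_refresh(void)		/* caller must hold clients_lock */
{
	cache_size_latch = cache_size;
}

/* Periodic path: A then B, the established lock order. */
static void cleanup_old_buffers(void)
{
	pthread_mutex_lock(&clients_lock);
	pthread_mutex_lock(&client_lock);
	/* ... evict aged buffers ... */
	pthread_mutex_unlock(&client_lock);
	pthread_mutex_unlock(&clients_lock);
}

/* Allocation path: already holds B, so it must not block waiting for A. */
static void get_memory_limit(void)		/* caller holds client_lock */
{
	if (cache_size != cache_size_latch) {
		if (pthread_mutex_trylock(&clients_lock) == 0) {
			cache_size_refresh();
			pthread_mutex_unlock(&clients_lock);
		}
		/* else: skip it; the next allocation re-checks the limit */
	}
}

int main(void)
{
	pthread_mutex_lock(&client_lock);
	get_memory_limit();
	pthread_mutex_unlock(&client_lock);
	cleanup_old_buffers();
	printf("latch is now %ld\n", cache_size_latch);
	return 0;
}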
Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 280b56c229d6..fd28146dd952 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -940,10 +940,11 @@ static void __get_memory_limit(struct dm_bufio_client *c, { unsigned long buffers; - if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { - mutex_lock(&dm_bufio_clients_lock); - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); + if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { + if (mutex_trylock(&dm_bufio_clients_lock)) { + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + } } buffers = dm_bufio_cache_size_per_client >> From 390020ad2af9ca04844c4f3b1f299ad8746d84c8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 30 Apr 2017 17:34:53 -0400 Subject: [PATCH 50/50] dm bufio: check new buffer allocation watermark every 30 seconds dm-bufio checks a watermark when it allocates a new buffer in __bufio_new(). However, it doesn't check the watermark when the user changes /sys/module/dm_bufio/parameters/max_cache_size_bytes. This may result in a problem - if the watermark is high enough so that all possible buffers are allocated and if the user lowers the value of "max_cache_size_bytes", the watermark will never be checked against the new value because no new buffer would be allocated. To fix this, change __evict_old_buffers() so that it checks the watermark. __evict_old_buffers() is called every 30 seconds, so if the user reduces "max_cache_size_bytes", dm-bufio will react to this change within 30 seconds and decrease memory consumption. Depends-on: 1b0fb5a5b2 ("dm bufio: avoid a possible ABBA deadlock") Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index fd28146dd952..c92c31b23e54 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1796,9 +1796,17 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) struct dm_buffer *b, *tmp; unsigned retain_target = get_retain_buffers(c); unsigned count; + LIST_HEAD(write_list); dm_bufio_lock(c); + __check_watermark(c, &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + __flush_write_list(&write_list); + dm_bufio_lock(c); + } + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { if (count <= retain_target) @@ -1823,6 +1831,8 @@ static void cleanup_old_buffers(void) mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + list_for_each_entry(c, &dm_bufio_all_clients, client_list) __evict_old_buffers(c, max_age_hz);
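The __evict_old_buffers() hunk above follows a gather-then-flush pattern: over-watermark buffers are moved onto a local write_list while the bufio lock is held, the lock is dropped for the slow writeback, and it is retaken before the age-based eviction continues, so the periodic job never blocks allocations while it does I/O. A standalone sketch of that pattern (not the dm-bufio code; struct buf, the counters and the helper names below are invented stand-ins for __check_watermark(), __flush_write_list() and __evict_old_buffers()):

#include <pthread.h>
#include <stdio.h>

struct buf { struct buf *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct buf *dirty;			/* dirty list, protected by lock */
static unsigned int ndirty, watermark = 1;

static void check_watermark(struct buf **write_list)	/* called under lock */
{
	while (ndirty > watermark && dirty) {
		struct buf *b = dirty;

		dirty = b->next;
		ndirty--;
		b->next = *write_list;		/* move onto the private list */
		*write_list = b;
	}
}

static void flush_write_list(struct buf *b)	/* slow I/O, lock not held */
{
	for (; b; b = b->next)
		printf("writing back buffer %d\n", b->id);
}

static void evict_old_buffers(void)
{
	struct buf *write_list = NULL;

	pthread_mutex_lock(&lock);
	check_watermark(&write_list);		/* honours a lowered watermark ... */
	if (write_list) {
		pthread_mutex_unlock(&lock);
		flush_write_list(write_list);	/* ... without holding the lock */
		pthread_mutex_lock(&lock);
	}
	/* ... age-based eviction would continue here ... */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	static struct buf bufs[3] = { { NULL, 0 }, { NULL, 1 }, { NULL, 2 } };
	int i;

	for (i = 0; i < 3; i++) {		/* build the dirty list */
		bufs[i].next = dirty;
		dirty = &bufs[i];
		ndirty++;
	}
	evict_old_buffers();
	return 0;
}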