From f046f89a99ccfd9408b94c653374ff3065c7edb3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:24 +0000 Subject: [PATCH 01/10] dm thin: fix discard corruption Fix a bug in dm_btree_remove that could leave leaf values with incorrect reference counts. The effect of this was that removal of a shared block could result in the space maps thinking the block was no longer used. More concretely, if you have a thin device and a snapshot of it, sending a discard to a shared region of the thin could corrupt the snapshot. Thinp uses a 2-level nested btree to store its mappings. The first level is indexed by thin device, and the second level by logical block. Often when we're removing an entry in this mapping tree we need to rebalance nodes, which can involve shadowing them, possibly creating a copy if the block is shared. If we do create a copy then children of that node need to have their reference counts incremented. In this way reference counts percolate down the tree as shared trees diverge. The rebalance functions were incrementing the children at the appropriate time, but they were always assuming the children were internal nodes. This meant the leaf values (in our case packed block/flags entries) were not being incremented. Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 4 +- drivers/md/persistent-data/dm-btree-remove.c | 46 ++++++++++---------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 009339d62828..ab95e5ff3758 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2544,7 +2544,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 6, 1}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2831,7 +2831,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 7, 1}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index c4f28133ef82..b88757cd0d1d 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -139,15 +139,8 @@ struct child { struct btree_node *n; }; -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct btree_node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, struct child *result) { int r, inc; @@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent, result->n = dm_block_data(result->block); if (inc) - inc_children(info->tm, result->n, &le64_type); + inc_children(info->tm, result->n, vt); *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); @@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent; @@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s,
struct dm_btree_info *info, parent = dm_block_data(shadow_current(s)); - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, &right); + r = init_child(info, vt, parent, left_index + 1, &right); if (r) { exit_child(info, &left); return r; @@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent = dm_block_data(shadow_current(s)); @@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, /* * FIXME: fill out an array? */ - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, ¢er); + r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } - r = init_child(info, parent, left_index + 2, &right); + r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); @@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm, } static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; uint32_t child_entries; @@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s, has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) - r = rebalance2(s, info, i); + r = rebalance2(s, info, vt, i); else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); + r = rebalance2(s, info, vt, i - 1); else - r = rebalance3(s, info, i - 1); + r = rebalance3(s, info, vt, i - 1); return r; } @@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); - r = rebalance_children(s, info, key); + r = rebalance_children(s, info, vt, key); if (r) break; @@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { From 58051b94e05a59c4d34f9f1a441af40894817c59 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: [PATCH 02/10] dm thin: fix non power of two discard granularity calc Fix a discard granularity calculation to work for non power of 2 block sizes. In order for thinp to passdown discard bios to the underlying data device, the data device must have a discard granularity that is a factor of the thinp block size. Originally this check was done by using bitops since the block_size was known to be a power of two. Introduced by commit f13945d75730081830b6f3360266950e2b7c9067 ("dm thin: support a non power of 2 discard_granularity"). 
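For illustration only (a standalone userspace sketch with made-up sector counts, not part of the patch): the old bitmask test only computes a true remainder when the granularity is a power of two, whereas a sector_div()-style modulo works for any granularity.

  #include <stdio.h>
  #include <stdint.h>

  /* Mirrors the is_factor() helper added by this patch, using plain '%'
   * in place of the kernel's sector_div(). */
  static int is_factor(uint64_t block_size, uint32_t n)
  {
          return !(block_size % n);
  }

  int main(void)
  {
          uint64_t block_size = 1536;  /* thinp block size in sectors (made up) */
          uint32_t granularity = 768;  /* non power of 2 data dev granularity (made up) */

          /* Old check: 1536 & 767 = 512, so passdown would be wrongly disabled. */
          printf("bitmask says factor: %d\n", !(block_size & (granularity - 1)));

          /* New check: 1536 % 768 = 0, so 768 really is a factor. */
          printf("is_factor says factor: %d\n", is_factor(block_size, granularity));

          return 0;
  }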
Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ab95e5ff3758..004ad1652b73 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1577,6 +1577,11 @@ static bool data_dev_supports_discard(struct pool_c *pt) return q && blk_queue_discard(q); } +static bool is_factor(sector_t block_size, uint32_t n) +{ + return !sector_div(block_size, n); +} + /* * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not. @@ -1602,7 +1607,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) else if (data_limits->discard_granularity > block_size) reason = "discard granularity larger than a block"; - else if (block_size & (data_limits->discard_granularity - 1)) + else if (!is_factor(block_size, data_limits->discard_granularity)) reason = "discard granularity not a factor of block size"; if (reason) { From 3b6b7813b198b578aa7e04e4047ddb8225c37b7f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: [PATCH 03/10] dm verity: avoid deadlock A deadlock was found in the prefetch code in the dm verity map function. This patch fixes this by transferring the prefetch to a worker thread and skipping it completely if kmalloc fails. If generic_make_request is called recursively, it queues the I/O request on the current->bio_list without making the I/O request and returns. The routine making the recursive call cannot wait for the I/O to complete. The deadlock occurs when one thread grabs the bufio_client mutex and waits for an I/O to complete but the I/O is queued on another thread's current->bio_list and is waiting to get the mutex held by the first thread. The fix recognises that prefetching is not essential. If memory can be allocated, it queues the prefetch request to the worker thread, but if not, it does nothing. Signed-off-by: Paul Taysom Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Cc: stable@kernel.org --- drivers/md/dm-bufio.c | 2 ++ drivers/md/dm-verity.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 3c955e10a618..c6083132c4b8 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1025,6 +1025,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, { struct blk_plug plug; + BUG_ON(dm_bufio_in_request()); + blk_start_plug(&plug); dm_bufio_lock(c); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3c..a746f1d21c66 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -93,6 +93,13 @@ struct dm_verity_io { */ }; +struct dm_verity_prefetch_work { + struct work_struct work; + struct dm_verity *v; + sector_t block; + unsigned n_blocks; +}; + static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) { return (struct shash_desc *)(io + 1); @@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error) * The root buffer is not prefetched, it is assumed that it will be cached * all the time. 
*/ -static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +static void verity_prefetch_io(struct work_struct *work) { + struct dm_verity_prefetch_work *pw = + container_of(work, struct dm_verity_prefetch_work, work); + struct dm_verity *v = pw->v; int i; for (i = v->levels - 2; i >= 0; i--) { sector_t hash_block_start; sector_t hash_block_end; - verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); - verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); + verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); @@ -452,6 +462,25 @@ no_prefetch_cluster: dm_bufio_prefetch(v->bufio, hash_block_start, hash_block_end - hash_block_start + 1); } + + kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ + struct dm_verity_prefetch_work *pw; + + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!pw) + return; + + INIT_WORK(&pw->work, verity_prefetch_io); + pw->v = v; + pw->block = io->block; + pw->n_blocks = io->n_blocks; + queue_work(v->verify_wq, &pw->work); } /* @@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) memcpy(io->io_vec, bio_iovec(bio), io->io_vec_size * sizeof(struct bio_vec)); - verity_prefetch_io(v, io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -858,7 +887,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, From 414dd67d50a6b9a11af23bbb68e8fae13d726c8b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: [PATCH 04/10] dm cache: avoid 64 bit division on 32 bit Squash various 32bit link errors. >> on i386: >> drivers/built-in.o: In function `is_discarded_oblock': >> dm-cache-target.c:(.text+0x1ea28e): undefined reference to `__udivdi3' ... Reported-by: Randy Dunlap Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0f4e84b15c30..5ad227f0cea3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -158,7 +158,7 @@ struct cache { /* * origin_blocks entries, discarded if set. 
*/ - sector_t discard_block_size; /* a power of 2 times sectors per block */ + uint32_t discard_block_size; /* a power of 2 times sectors per block */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; @@ -412,17 +412,24 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) { - sector_t discard_blocks = cache->discard_block_size; + uint32_t discard_blocks = cache->discard_block_size; dm_block_t b = from_oblock(oblock); if (!block_size_is_power_of_two(cache)) - (void) sector_div(discard_blocks, cache->sectors_per_block); + discard_blocks = discard_blocks / cache->sectors_per_block; else discard_blocks >>= cache->sectors_per_block_shift; - (void) sector_div(b, discard_blocks); + b = block_div(b, discard_blocks); return to_dblock(b); } @@ -1002,7 +1009,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio) dm_block_t end_block = bio->bi_sector + bio_sectors(bio); dm_block_t b; - (void) sector_div(end_block, cache->discard_block_size); + end_block = block_div(end_block, cache->discard_block_size); for (b = start_block; b < end_block; b++) set_discard(cache, to_dblock(b)); @@ -1835,7 +1842,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; - (void) sector_div(origin_blocks, ca->block_size); + origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); cache->sectors_per_block = ca->block_size; @@ -1848,7 +1855,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) dm_block_t cache_size = ca->cache_sectors; cache->sectors_per_block_shift = -1; - (void) sector_div(cache_size, ca->block_size); + cache_size = block_div(cache_size, ca->block_size); cache->cache_size = to_cblock(cache_size); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); From 617a0b89da4898d4cc990c9eb4bc9c0591c538a5 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 20 Mar 2013 17:21:26 +0000 Subject: [PATCH 05/10] dm cache: detect cache_create failure Return error if cache_create() fails. A missing return check made cache_ctr continue even after an error in cache_create() resulting in the cache object being destroyed. So a simple failure like an odd number of cache policy config value arguments would result in an oops. 
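In outline (a standalone sketch of the pattern only; cache_create_sketch() and its arguments are invented for illustration and are not the real dm-cache interfaces), the constructor has to bail out as soon as the create step reports failure, because on that path the object has already been torn down:

  #include <stdio.h>
  #include <stdlib.h>
  #include <errno.h>

  struct cache { int configured; };

  /* Stands in for cache_create(): on failure the object is already gone. */
  static int cache_create_sketch(int odd_policy_argc, struct cache **result)
  {
          if (odd_policy_argc) {
                  *result = NULL;
                  return -EINVAL;
          }
          *result = calloc(1, sizeof(**result));
          return *result ? 0 : -ENOMEM;
  }

  static int cache_ctr_sketch(int odd_policy_argc)
  {
          struct cache *cache;
          int r;

          r = cache_create_sketch(odd_policy_argc, &cache);
          if (r)                  /* the check this patch adds; without it the */
                  return r;       /* code below would dereference a NULL cache */

          cache->configured = 1;  /* safe only because of the check above */
          free(cache);
          return 0;
  }

  int main(void)
  {
          printf("valid args      -> %d\n", cache_ctr_sketch(0));
          printf("odd policy args -> %d\n", cache_ctr_sketch(1));
          return 0;
  }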
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 5ad227f0cea3..76cc910557f0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2009,6 +2009,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; r = cache_create(ca, &cache); + if (r) + goto out; r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); if (r) { From b978440b8db901aba0c4cd38c7c841c9b5cd9a7e Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 20 Mar 2013 17:21:26 +0000 Subject: [PATCH 06/10] dm cache: avoid calling policy destructor twice on error If the cache policy's config values are not able to be set we must set the policy to NULL after destroying it in create_cache_policy() so we don't attempt to destroy it a second time later. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 76cc910557f0..79ac8603644d 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1763,8 +1763,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, } r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); - if (r) + if (r) { + *error = "Error setting cache policy's config values"; dm_cache_policy_destroy(cache->policy); + cache->policy = NULL; + } return r; } From 79ed9caffc9fff67aa64fd683e791aa70f1bcb51 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: [PATCH 07/10] dm cache: metadata clear dirty bits on clean shutdown When writing the dirty bitset to the metadata device on a clean shutdown, clear the dirty bits. Previously they were left indicating the cache was dirty. This led to confusion about whether there really was dirty data in the cache or not. (This was a harmless bug.) Reported-by: Darrick J. Wong Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index fbd3625f2748..1bb91802b22a 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -979,7 +979,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty /* nothing to be done */ return 0; - value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); + value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0)); __dm_bless_for_disk(&value); r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), From e2e74d617eadc15f601983270c4f4a6935c5a943 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: [PATCH 08/10] dm cache: fix race in writethrough implementation We have found a race in the optimisation used in the dm cache writethrough implementation. Currently, dm core sends the cache target two bios, one for the origin device and one for the cache device and these are processed in parallel. This patch avoids the race by changing the code back to a simpler (slower) implementation which processes the two writes in series, one after the other, until we can develop a complete fix for the problem. 
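In outline the serial scheme chains the bio completions, as in the sketch below (a standalone userspace illustration with cut-down stand-in types, not the kernel code; in the real patch the second submission is deferred to the worker thread because the endio runs in interrupt context):

  #include <stdio.h>

  struct bio;
  typedef void (*bio_end_io_t)(struct bio *, int);

  /* Cut-down stand-ins for the kernel structures. */
  struct bio {
          bio_end_io_t bi_end_io;
  };

  struct per_bio_data {
          bio_end_io_t saved_bi_end_io;
  };

  static struct per_bio_data pb;  /* one bio in flight in this sketch */

  static void caller_end_io(struct bio *bio, int err)
  {
          printf("caller sees completion, err=%d\n", err);
  }

  static void submit_to_cache(struct bio *bio)
  {
          printf("write reaches the cache device\n");
          bio->bi_end_io(bio, 0);         /* pretend it completes OK */
  }

  /* Runs when the origin write finishes; reissues the same bio to the cache. */
  static void writethrough_endio_sketch(struct bio *bio, int err)
  {
          bio->bi_end_io = pb.saved_bi_end_io;    /* restore the caller's endio */

          if (err) {
                  bio->bi_end_io(bio, err);
                  return;
          }

          /* The kernel defers this to the worker thread; done inline here. */
          submit_to_cache(bio);
  }

  static void submit_to_origin(struct bio *bio)
  {
          printf("write reaches the origin device\n");
          bio->bi_end_io(bio, 0);         /* pretend it completes OK */
  }

  int main(void)
  {
          struct bio bio = { .bi_end_io = caller_end_io };

          /* remap_to_origin_then_cache(): hook the completion, origin goes first. */
          pb.saved_bi_end_io = bio.bi_end_io;
          bio.bi_end_io = writethrough_endio_sketch;
          submit_to_origin(&bio);

          return 0;
  }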
When the cache is in writethrough mode it needs to send WRITE bios to both the origin and cache devices. Previously we've been implementing this by having dm core query the cache target on every write to find out how many copies of the bio it wants. The cache will ask for two bios if the block is in the cache, and one otherwise. The main problem with this is that it's racy. At the time this check is made the bio hasn't yet been submitted and so isn't being taken into account when quiescing a block for migration (promotion or demotion). This means a single bio may be submitted when two were needed because the block has since been promoted to the cache (catastrophic), or two bios when only one is needed (harmless). I really don't want to start entering bios into the quiescing system (deferred_set) in the get_num_write_bios callback. Instead this patch simplifies things: only one bio is submitted by the core, and it is written first to the origin and then to the cache device, in series. Obviously this will have a latency impact. deferred_writethrough_bios is introduced to record bios that must be later issued to the cache device from the worker thread. This deferred submission, after the origin bio completes, is required given that we're in interrupt context (writethrough_endio). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 138 ++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 50 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 79ac8603644d..ff267db60025 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -142,6 +142,7 @@ struct cache { spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; + struct bio_list deferred_writethrough_bios; struct list_head quiesced_migrations; struct list_head completed_migrations; struct list_head need_commit_migrations; @@ -199,6 +200,11 @@ struct per_bio_data { bool tick:1; unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; + + /* writethrough fields */ + struct cache *cache; + dm_cblock_t cblock; + bio_end_io_t *saved_bi_end_io; }; struct dm_cache_migration { @@ -616,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_writethrough_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + bio->bi_end_io = pb->saved_bi_end_io; + + if (err) { + bio_endio(bio, err); + return; + } + + remap_to_cache(pb->cache, bio, pb->cblock); + + /* + * We can't issue this bio directly, since we're in interrupt + * context. So it get's put on a bio list for processing by the + * worker thread. + */ + defer_writethrough_bio(pb->cache, bio); + } + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier.
+ */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + pb->cache = cache; + pb->cblock = cblock; + pb->saved_bi_end_io = bio->bi_end_io; + bio->bi_end_io = writethrough_endio; + + remap_to_origin_clear_discard(pb->cache, bio, oblock); +} + /*---------------------------------------------------------------- * Migration processing * @@ -1077,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - } else + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); issue(cache, bio); @@ -1093,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, case POLICY_MISS: inc_miss_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio, 0); - } else { - remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); - } + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); break; case POLICY_NEW: @@ -1224,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) submit_bios ? 
generic_make_request(bio) : bio_io_error(bio); } +static void process_deferred_writethrough_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_writethrough_bios); + bio_list_init(&cache->deferred_writethrough_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + static void writeback_some_dirty_blocks(struct cache *cache) { int r = 0; @@ -1320,6 +1379,7 @@ static int more_work(struct cache *cache) else return !bio_list_empty(&cache->deferred_bios) || !bio_list_empty(&cache->deferred_flush_bios) || + !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || !list_empty(&cache->need_commit_migrations); @@ -1338,6 +1398,8 @@ static void do_worker(struct work_struct *ws) writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); + if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1803,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); - static int cache_create(struct cache_args *ca, struct cache **result) { int r = 0; @@ -1831,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) memcpy(&cache->features, &ca->features, sizeof(cache->features)); - if (cache->features.write_through) - ti->num_write_bios = cache_num_write_bios; - cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -1883,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); + bio_list_init(&cache->deferred_writethrough_bios); INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); @@ -2028,20 +2086,6 @@ out: return r; } -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) -{ - int r; - struct cache *cache = ti->private; - dm_oblock_t block = get_bio_block(cache, bio); - dm_cblock_t cblock; - - r = policy_lookup(cache->policy, block, &cblock); - if (r < 0) - return 2; /* assume the worst */ - - return (!r && !is_dirty(cache, cblock)) ? 2 : 1; -} - static int cache_map(struct dm_target *ti, struct bio *bio) { struct cache *cache = ti->private; @@ -2109,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? 
- remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } else { + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - cell_defer(cache, cell, false); - } + + cell_defer(cache, cell, false); break; case POLICY_MISS: @@ -2547,7 +2585,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 4e7f506f6429636115e2f58f9f97089acc62524a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: [PATCH 09/10] dm cache: policy change version from string to integer set Separate dm cache policy version string into 3 unsigned numbers corresponding to major, minor and patchlevel and store them at the end of the on-disk metadata so we know which version of the policy generated the hints in case a future version wants to use them differently. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 15 +++++++++++++-- drivers/md/dm-cache-policy-cleaner.c | 7 +++++-- drivers/md/dm-cache-policy-internal.h | 2 ++ drivers/md/dm-cache-policy-mq.c | 8 ++++++-- drivers/md/dm-cache-policy.c | 8 ++++++++ drivers/md/dm-cache-policy.h | 2 ++ 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1bb91802b22a..74213d1f1db5 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -83,6 +83,8 @@ struct cache_disk_superblock { __le32 read_misses; __le32 write_hits; __le32 write_misses; + + __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; } __packed; struct dm_cache_metadata { @@ -109,6 +111,7 @@ struct dm_cache_metadata { bool clean_when_opened:1; char policy_name[CACHE_POLICY_NAME_SIZE]; + unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; size_t policy_hint_size; struct dm_cache_statistics stats; }; @@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); disk_super->version = cpu_to_le32(CACHE_VERSION); - memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, @@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); - memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); disk_super->read_hits = cpu_to_le32(0); disk_super->read_misses = cpu_to_le32(0); @@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_version[0] = 
le32_to_cpu(disk_super->policy_version[0]); + cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); + cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); @@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); + disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); + disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -1070,6 +1079,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po __le32 value; size_t hint_size; const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); if (!policy_name[0] || (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) @@ -1077,6 +1087,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po if (strcmp(cmd->policy_name, policy_name)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); hint_size = dm_cache_policy_get_hint_size(policy); if (!hint_size) diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index cc05d70b3cb8..b04d1f904d07 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -17,7 +17,6 @@ /*----------------------------------------------------------------*/ #define DM_MSG_PREFIX "cache cleaner" -#define CLEANER_VERSION "1.0.0" /* Cache entry struct. 
*/ struct wb_cache_entry { @@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, static struct dm_cache_policy_type wb_policy_type = { .name = "cleaner", + .version = {1, 0, 0}, .hint_size = 0, .owner = THIS_MODULE, .create = wb_create @@ -446,7 +446,10 @@ static int __init wb_init(void) if (r < 0) DMERR("register failed %d", r); else - DMINFO("version " CLEANER_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + wb_policy_type.version[0], + wb_policy_type.version[1], + wb_policy_type.version[2]); return r; } diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 52a75beeced5..0928abdc49f0 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p); */ const char *dm_cache_policy_get_name(struct dm_cache_policy *p); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 964153255076..dc112a7137fe 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -14,7 +14,6 @@ #include #define DM_MSG_PREFIX "cache-policy-mq" -#define MQ_VERSION "1.0.0" static struct kmem_cache *mq_entry_cache; @@ -1133,6 +1132,7 @@ bad_cache_alloc: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1164,7 +1165,10 @@ static int __init mq_init(void) r = dm_cache_policy_register(&default_policy_type); if (!r) { - DMINFO("version " MQ_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + mq_policy_type.version[0], + mq_policy_type.version[1], + mq_policy_type.version[2]); return 0; } diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 2cbf5fdaac52..21c03c570c06 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p) } EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) { struct dm_cache_policy_type *t = p->private; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index f0f51b260544..558bdfdabf5f 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -196,6 +196,7 @@ struct dm_cache_policy { * We maintain a little register of the different policy types. */ #define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3 struct dm_cache_policy_type { /* For use by the register code only. */ @@ -206,6 +207,7 @@ struct dm_cache_policy_type { * what gets passed on the target line to select your policy. */ char name[CACHE_POLICY_NAME_SIZE]; + unsigned version[CACHE_POLICY_VERSION_SIZE]; /* * Policies may store a hint for each each cache block. 
From ea2dd8c1ed0becee9812cf0840a9cd553ed398fe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Mar 2013 17:21:28 +0000 Subject: [PATCH 10/10] dm cache: policy ignore hints if generated by different version When reading the dm cache metadata from disk, ignore the policy hints unless they were generated by the same major version number of the same policy module. The hints are considered to be private data belonging to the specific module that generated them and there is no requirement for them to make sense to different versions of the policy that generated them. Policy modules are all required to work fine if no previous hints are supplied (or if existing hints are lost). Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 47 ++++++++++++++++++++++++++-------- drivers/md/dm-cache-metadata.h | 2 +- drivers/md/dm-cache-target.c | 3 +-- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 74213d1f1db5..83e995fece88 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -863,18 +863,43 @@ struct thunk { bool hints_valid; }; +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. 
+ */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + static bool hints_array_initialized(struct dm_cache_metadata *cmd) { return cmd->hint_root && cmd->policy_hint_size; } static bool hints_array_available(struct dm_cache_metadata *cmd, - const char *policy_name) + struct dm_cache_policy *policy) { - bool policy_names_match = !strncmp(cmd->policy_name, policy_name, - sizeof(cmd->policy_name)); - - return cmd->clean_when_opened && policy_names_match && + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && hints_array_initialized(cmd); } @@ -908,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf) return r; } -static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { struct thunk thunk; @@ -918,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam thunk.cmd = cmd; thunk.respect_dirty_flags = cmd->clean_when_opened; - thunk.hints_valid = hints_array_available(cmd, policy_name); + thunk.hints_valid = hints_array_available(cmd, policy); return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); } -int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { int r; down_read(&cmd->root_lock); - r = __load_mappings(cmd, policy_name, fn, context); + r = __load_mappings(cmd, policy, fn, context); up_read(&cmd->root_lock); return r; @@ -1085,7 +1112,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - if (strcmp(cmd->policy_name, policy_name)) { + if (!policy_unchanged(cmd, policy)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 135864ea0eee..f45cef21f3d0 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, uint32_t hint, bool hint_valid); int dm_cache_load_mappings(struct dm_cache_metadata *cmd, - const char *policy_name, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ff267db60025..66120bd46d15 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2369,8 +2369,7 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { - r = dm_cache_load_mappings(cache->cmd, - dm_cache_policy_get_name(cache->policy), + r = dm_cache_load_mappings(cache->cmd, cache->policy, load_mapping, cache); if (r) { DMERR("could not load cache mappings");