From 85067747cf9888249fa11fa49ef75af5192d3988 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jun 2020 16:00:58 -0400 Subject: [PATCH 1/6] dm: do not use waitqueue for request-based DM Given request-based DM now uses blk-mq's blk_mq_queue_inflight() to determine if outstanding IO has completed (and DM has no control over the blk-mq state machine used to track outstanding IO) it is unsafe to wakeup waiter (dm_wait_for_completion) before blk-mq has cleared a request's state bits (e.g. MQ_RQ_IN_FLIGHT or MQ_RQ_COMPLETE). As such dm_wait_for_completion() could be left to wait indefinitely if no other requests complete. Fix this by eliminating request-based DM's use of waitqueue to wait for blk-mq requests to complete in dm_wait_for_completion. Signed-off-by: Ming Lei Depends-on: 3c94d83cb3526 ("blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight()") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 4 --- drivers/md/dm.c | 64 ++++++++++++++++++++++++++++------------------ 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index f60c02512121..85e0daabad49 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -146,10 +146,6 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md) { - /* nudge anyone waiting on suspend queue */ - if (unlikely(wq_has_sleeper(&md->wait))) - wake_up(&md->wait); - /* * dm_put() must be at the end of this function. See the comment above */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e6807792fec8..446aff589732 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -654,28 +654,6 @@ static void free_tio(struct dm_target_io *tio) bio_put(&tio->clone); } -static bool md_in_flight_bios(struct mapped_device *md) -{ - int cpu; - struct hd_struct *part = &dm_disk(md)->part0; - long sum = 0; - - for_each_possible_cpu(cpu) { - sum += part_stat_local_read_cpu(part, in_flight[0], cpu); - sum += part_stat_local_read_cpu(part, in_flight[1], cpu); - } - - return sum != 0; -} - -static bool md_in_flight(struct mapped_device *md) -{ - if (queue_is_mq(md->queue)) - return blk_mq_queue_inflight(md->queue); - else - return md_in_flight_bios(md); -} - u64 dm_start_time_ns_from_clone(struct bio *bio) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); @@ -2470,15 +2448,29 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); -static int dm_wait_for_completion(struct mapped_device *md, long task_state) +static bool md_in_flight_bios(struct mapped_device *md) +{ + int cpu; + struct hd_struct *part = &dm_disk(md)->part0; + long sum = 0; + + for_each_possible_cpu(cpu) { + sum += part_stat_local_read_cpu(part, in_flight[0], cpu); + sum += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + + return sum != 0; +} + +static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state) { int r = 0; DEFINE_WAIT(wait); - while (1) { + while (true) { prepare_to_wait(&md->wait, &wait, task_state); - if (!md_in_flight(md)) + if (!md_in_flight_bios(md)) break; if (signal_pending_state(task_state, current)) { @@ -2493,6 +2485,28 @@ static int dm_wait_for_completion(struct mapped_device *md, long task_state) return r; } +static int dm_wait_for_completion(struct mapped_device *md, long task_state) +{ + int r = 0; + + if (!queue_is_mq(md->queue)) + return dm_wait_for_bios_completion(md, task_state); + + while (true) { + if (!blk_mq_queue_inflight(md->queue)) + break; + + if (signal_pending_state(task_state, current)) { + r = -EINTR; + break; + } + + msleep(5); + } + + return r; +} + /* * Process the deferred bios */ From 382761dc6312965a11f82f2217e16ec421bf17ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:46 +0200 Subject: [PATCH 2/6] dm: use bio_uninit instead of bio_disassociate_blkg bio_uninit is the proper API to clean up a BIO that has been allocated on stack or inside a structure that doesn't come from the BIO allocator. Switch dm to use that instead of bio_disassociate_blkg, which really is an implementation detail. Note that the bio_uninit calls are also moved to the two callers of __send_empty_flush, so that they better pair with the bio_init calls used to initialize them. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 446aff589732..4cc7e2599664 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1443,9 +1443,6 @@ static int __send_empty_flush(struct clone_info *ci) BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); - - bio_disassociate_blkg(ci->bio); - return 0; } @@ -1633,6 +1630,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); + bio_uninit(ci.bio); /* dec_pending submits any data associated with flush */ } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; @@ -1707,6 +1705,7 @@ static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map, ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); + bio_uninit(ci.bio); /* dec_pending submits any data associated with flush */ } else { struct dm_target_io *tio; From a46624580376a3a0beb218d94cbc7f258696e29f Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Tue, 30 Jun 2020 17:49:24 +0200 Subject: [PATCH 3/6] dm writecache: reject asynchronous pmem devices DM writecache does not handle asynchronous pmem. Reject it when supplied as cache. Link: https://lore.kernel.org/linux-nvdimm/87lfk5hahc.fsf@linux.ibm.com/ Fixes: 6e84200c0a29 ("virtio-pmem: Add virtio pmem driver") Signed-off-by: Michal Suchanek Acked-by: Mikulas Patocka Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 30505d70f423..5358894bb9fd 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2266,6 +2266,12 @@ invalid_optional: } if (WC_MODE_PMEM(wc)) { + if (!dax_synchronous(wc->ssd_dev->dax_dev)) { + r = -EOPNOTSUPP; + ti->error = "Asynchronous persistent memory not supported as pmem cache"; + goto bad; + } + r = persistent_memory_claim(wc); if (r) { ti->error = "Unable to map persistent memory for cache"; From ce34c9b461b50001892b0b348e024d2275014ede Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 2 Jul 2020 23:11:40 +0800 Subject: [PATCH 4/6] dm zoned: fix unused but set variable warnings Fix unused but set variable warnings: drivers/md/dm-zoned-reclaim.c:504:42: warning: variable nr_rnd set but not used [-Wunused-but-set-variable] 504 | unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0; | ^~~~~~ drivers/md/dm-zoned-reclaim.c:504:24: warning: variable nr_unmap_rnd set but not used [-Wunused-but-set-variable] 504 | unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0; | ^~~~~~~~~~~~ Fixes: f97809aec589 ("dm zoned: per-device reclaim") Signed-off-by: Wei Yongjun Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-reclaim.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index dd1eebf6e50f..7e0cc2d732cf 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -501,7 +501,7 @@ static void dmz_reclaim_work(struct work_struct *work) { struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work); struct dmz_metadata *zmd = zrc->metadata; - unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0; + unsigned int p_unmap; int ret; if (dmz_dev_is_dying(zmd)) @@ -527,9 +527,6 @@ static void dmz_reclaim_work(struct work_struct *work) zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2); } - nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); - nr_rnd = dmz_nr_rnd_zones(zmd, zrc->dev_idx); - DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)", dmz_metadata_label(zmd), zrc->dev_idx, zrc->kc_throttle.throttle, From 174364f6a8979655f71b04b6492657aec3762703 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 8 Jul 2020 09:20:22 +0900 Subject: [PATCH 5/6] dm zoned: Fix zone reclaim trigger Only triggering reclaim based on the percentage of unmapped cache zones can fail to detect cases where reclaim is needed, e.g. if the target has only 2 or 3 cache zones and only one unmapped cache zone, the percentage of free cache zones is higher than DMZ_RECLAIM_LOW_UNMAP_ZONES (30%) and reclaim does not trigger. This problem, combined with the fact that dmz_schedule_reclaim() is called from dmz_handle_bio() without the map lock held, leads to a race between zone allocation and dmz_should_reclaim() result. Depending on the workload applied, this race can lead to the write path waiting forever for a free zone without reclaim being triggered. Fix this by moving dmz_schedule_reclaim() inside dmz_alloc_zone() under the map lock. This results in checking the need for zone reclaim whenever a new data or buffer zone needs to be allocated. Also fix dmz_reclaim_percentage() to always return 0 if the number of unmapped cache (or random) zones is less than or equal to 1. Suggested-by: Shin'ichiro Kawasaki Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 9 ++++++++- drivers/md/dm-zoned-reclaim.c | 2 ++ drivers/md/dm-zoned-target.c | 10 +--------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 5cf6f5f552e0..b298fefb022e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2217,8 +2217,15 @@ struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx, { struct list_head *list; struct dm_zone *zone; - int i = 0; + int i; + /* Schedule reclaim to ensure free zones are available */ + if (!(flags & DMZ_ALLOC_RECLAIM)) { + for (i = 0; i < zmd->nr_devs; i++) + dmz_schedule_reclaim(zmd->dev[i].reclaim); + } + + i = 0; again: if (flags & DMZ_ALLOC_CACHE) list = &zmd->unmap_cache_list; diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 7e0cc2d732cf..9c0ecc9568a4 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -456,6 +456,8 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc) nr_zones = dmz_nr_rnd_zones(zmd, zrc->dev_idx); nr_unmap = dmz_nr_unmap_rnd_zones(zmd, zrc->dev_idx); } + if (nr_unmap <= 1) + return 0; return nr_unmap * 100 / nr_zones; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index cf915009c306..42aa5139df7c 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -400,15 +400,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); struct dmz_metadata *zmd = dmz->metadata; struct dm_zone *zone; - int i, ret; - - /* - * Write may trigger a zone allocation. So make sure the - * allocation can succeed. - */ - if (bio_op(bio) == REQ_OP_WRITE) - for (i = 0; i < dmz->nr_ddevs; i++) - dmz_schedule_reclaim(dmz->dev[i].reclaim); + int ret; dmz_lock_metadata(zmd); From 6958c1c640af8c3f40fa8a2eee3b5b905d95b677 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 8 Jul 2020 12:25:20 -0400 Subject: [PATCH 6/6] dm: use noio when sending kobject event kobject_uevent may allocate memory and it may be called while there are dm devices suspended. The allocation may recurse into a suspended device, causing a deadlock. We must set the noio flag when sending a uevent. The observed deadlock was reported here: https://www.redhat.com/archives/dm-devel/2020-March/msg00025.html Reported-by: Khazhismel Kumykov Reported-by: Tahsin Erdogan Reported-by: Gabriel Krisman Bertazi Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4cc7e2599664..52449afd58eb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2939,17 +2940,25 @@ EXPORT_SYMBOL_GPL(dm_internal_resume_fast); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie) { + int r; + unsigned noio_flag; char udev_cookie[DM_COOKIE_LENGTH]; char *envp[] = { udev_cookie, NULL }; + noio_flag = memalloc_noio_save(); + if (!cookie) - return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); else { snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", DM_COOKIE_ENV_VAR_NAME, cookie); - return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, - action, envp); + r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); } + + memalloc_noio_restore(noio_flag); + + return r; } uint32_t dm_next_uevent_seq(struct mapped_device *md)