From 0738854939e6ec9b9111a8cfc0ca1dfa3cff6b2e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Sep 2014 23:02:59 +0800 Subject: [PATCH 1/6] blk-merge: fix blk_recount_segments QUEUE_FLAG_NO_SG_MERGE is set at default for blk-mq devices, so bio->bi_phys_segment computed may be bigger than queue_max_segments(q) for blk-mq devices, then drivers will fail to handle the case, for example, BUG_ON() in virtio_queue_rq() can be triggerd for virtio-blk: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1359146 This patch fixes the issue by ignoring the QUEUE_FLAG_NO_SG_MERGE flag if the computed bio->bi_phys_segment is bigger than queue_max_segments(q), and the regression is caused by commit 05f1dd53152173(block: add queue flag for disabling SG merging). Reported-by: Kick In Tested-by: Chris J Arges Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 54535831f1e1..77881798f793 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -10,10 +10,11 @@ #include "blk.h" static unsigned int __blk_recalc_rq_segments(struct request_queue *q, - struct bio *bio) + struct bio *bio, + bool no_sg_merge) { struct bio_vec bv, bvprv = { NULL }; - int cluster, high, highprv = 1, no_sg_merge; + int cluster, high, highprv = 1; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -35,7 +36,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; - no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); high = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { @@ -88,18 +88,23 @@ new_segment: void blk_recalc_rq_segments(struct request *rq) { - rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); + bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, + &rq->q->queue_flags); + + rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, + no_sg_merge); } void blk_recount_segments(struct request_queue *q, struct bio *bio) { - if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags)) + if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && + bio->bi_vcnt < queue_max_segments(q)) bio->bi_phys_segments = bio->bi_vcnt; else { struct bio *nxt = bio->bi_next; bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false); bio->bi_next = nxt; } From dc501dc0d9dc9cbabc18b920f91a26c207e9476c Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 2 Sep 2014 11:38:49 -0500 Subject: [PATCH 2/6] blk-mq: pass along blk_mq_alloc_tag_set return values Two of the blk-mq based drivers do not pass back the return value from blk_mq_alloc_tag_set, instead just returning -ENOMEM. blk_mq_alloc_tag_set returns -EINVAL if the number of queues or queue depth is bad. -ENOMEM implies that retrying after freeing some memory might be more successful, but that won't ever change in the -EINVAL cases. Change the null_blk and mtip32xx drivers to pass along the return value. Signed-off-by: Robert Elliott Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/null_blk.c | 29 +++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index db1e9560d8a7..5c8e7fe07745 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3918,7 +3918,6 @@ skip_create_disk: if (rv) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); - rv = -ENOMEM; goto block_queue_alloc_init_error; } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index a3b042c4d448..00d469c7f9f7 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -462,17 +462,21 @@ static int null_add_dev(void) struct gendisk *disk; struct nullb *nullb; sector_t size; + int rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); - if (!nullb) + if (!nullb) { + rv = -ENOMEM; goto out; + } spin_lock_init(&nullb->lock); if (queue_mode == NULL_Q_MQ && use_per_node_hctx) submit_queues = nr_online_nodes; - if (setup_queues(nullb)) + rv = setup_queues(nullb); + if (rv) goto out_free_nullb; if (queue_mode == NULL_Q_MQ) { @@ -484,22 +488,29 @@ static int null_add_dev(void) nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; nullb->tag_set.driver_data = nullb; - if (blk_mq_alloc_tag_set(&nullb->tag_set)) + rv = blk_mq_alloc_tag_set(&nullb->tag_set); + if (rv) goto out_cleanup_queues; nullb->q = blk_mq_init_queue(&nullb->tag_set); - if (!nullb->q) + if (!nullb->q) { + rv = -ENOMEM; goto out_cleanup_tags; + } } else if (queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); - if (!nullb->q) + if (!nullb->q) { + rv = -ENOMEM; goto out_cleanup_queues; + } blk_queue_make_request(nullb->q, null_queue_bio); init_driver_queues(nullb); } else { nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); - if (!nullb->q) + if (!nullb->q) { + rv = -ENOMEM; goto out_cleanup_queues; + } blk_queue_prep_rq(nullb->q, null_rq_prep_fn); blk_queue_softirq_done(nullb->q, null_softirq_done_fn); init_driver_queues(nullb); @@ -509,8 +520,10 @@ static int null_add_dev(void) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) + if (!disk) { + rv = -ENOMEM; goto out_cleanup_blk_queue; + } mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -544,7 +557,7 @@ out_cleanup_queues: out_free_nullb: kfree(nullb); out: - return -ENOMEM; + return rv; } static int __init null_init(void) From 5676e7b6db02b80eafc2e3ad316d5f2fee817ecb Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 2 Sep 2014 11:38:44 -0500 Subject: [PATCH 3/6] blk-mq: cleanup after blk_mq_init_rq_map failures In blk-mq.c blk_mq_alloc_tag_set, if: set->tags = kmalloc_node() succeeds, but one of the blk_mq_init_rq_map() calls fails, goto out_unwind; needs to free set->tags so the caller is not obligated to do so. None of the current callers (null_blk, virtio_blk, virtio_blk, or the forthcoming scsi-mq) do so. set->tags needs to be set to NULL after doing so, so other tag cleanup logic doesn't try to free a stale pointer later. Also set it to NULL in blk_mq_free_tag_set. Tested with error injection on the forthcoming scsi-mq + hpsa combination. Signed-off-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4aac82615a46..f9b85e83d9ba 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1982,6 +1982,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) out_unwind: while (--i >= 0) blk_mq_free_rq_map(set, set->tags[i], i); + kfree(set->tags); + set->tags = NULL; out: return -ENOMEM; } @@ -1997,6 +1999,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } kfree(set->tags); + set->tags = NULL; } EXPORT_SYMBOL(blk_mq_free_tag_set); From 2da78092dda13f1efd26edbbf99a567776913750 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 26 Aug 2014 09:05:36 -0600 Subject: [PATCH 4/6] block: Fix dev_t minor allocation lifetime Releases the dev_t minor when all references are closed to prevent another device from acquiring the same major/minor. Since the partition's release may be invoked from call_rcu's soft-irq context, the ext_dev_idr's mutex had to be replaced with a spinlock so as not so sleep. Signed-off-by: Keith Busch Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 24 ++++++++++++++---------- block/partition-generic.c | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 791f41943132..09da5e4a8e03 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -28,10 +28,10 @@ struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) -/* For extended devt allocation. ext_devt_mutex prevents look up +/* For extended devt allocation. ext_devt_lock prevents look up * results from going away underneath its user. */ -static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; @@ -420,9 +420,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) } /* allocate ext devt */ - mutex_lock(&ext_devt_mutex); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL); - mutex_unlock(&ext_devt_mutex); + idr_preload(GFP_KERNEL); + + spin_lock(&ext_devt_lock); + idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); + spin_unlock(&ext_devt_lock); + + idr_preload_end(); if (idx < 0) return idx == -ENOSPC ? -EBUSY : idx; @@ -447,9 +451,9 @@ void blk_free_devt(dev_t devt) return; if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - mutex_lock(&ext_devt_mutex); + spin_lock(&ext_devt_lock); idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - mutex_unlock(&ext_devt_mutex); + spin_unlock(&ext_devt_lock); } } @@ -665,7 +669,6 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)->devt); } EXPORT_SYMBOL(del_gendisk); @@ -690,13 +693,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } else { struct hd_struct *part; - mutex_lock(&ext_devt_mutex); + spin_lock(&ext_devt_lock); part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part && get_disk(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); } - mutex_unlock(&ext_devt_mutex); + spin_unlock(&ext_devt_lock); } return disk; @@ -1098,6 +1101,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); diff --git a/block/partition-generic.c b/block/partition-generic.c index 789cdea05893..0d9e5f97f0a8 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -211,6 +211,7 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); free_part_stats(p); free_part_info(p); kfree(p); @@ -253,7 +254,6 @@ void delete_partition(struct gendisk *disk, int partno) rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); - blk_free_devt(part_devt(part)); hd_struct_put(part); } From df35c7c912fe668797681842b3b74c61b0664050 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 9 Sep 2014 11:50:58 -0400 Subject: [PATCH 5/6] Block: fix unbalanced bypass-disable in blk_register_queue When a queue is registered, the block layer turns off the bypass setting (because bypass is enabled when the queue is created). This doesn't work well for queues that are unregistered and then registered again; we get a WARNING because of the unbalanced calls to blk_queue_bypass_end(). This patch fixes the problem by making blk_register_queue() call blk_queue_bypass_end() only the first time the queue is registered. Signed-off-by: Alan Stern Acked-by: Tejun Heo CC: James Bottomley CC: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4db5abf96b9e..17f5c84ce7bf 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -554,8 +554,10 @@ int blk_register_queue(struct gendisk *disk) * Initialization must be complete by now. Finish the initial * bypass from queue allocation. */ - queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); - blk_queue_bypass_end(q); + if (!blk_queue_init_done(q)) { + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + blk_queue_bypass_end(q); + } ret = blk_trace_init_sysfs(dev); if (ret) From a516440542afcb9647f88d12c35640baf02d07ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Sep 2014 09:02:03 -0600 Subject: [PATCH 6/6] blk-mq: scale depth and rq map appropriate if low on memory If we are running in a kdump environment, resources are scarce. For some SCSI setups with a huge set of shared tags, we run out of memory allocating what the drivers is asking for. So implement a scale back logic to reduce the tag depth for those cases, allowing the driver to successfully load. We should extend this to detect low memory situations, and implement a sane fallback for those (1 queue, 64 tags, or something like that). Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 88 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 19 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f9b85e83d9ba..383ea0cb1f0a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1321,6 +1321,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, continue; set->ops->exit_request(set->driver_data, tags->rqs[i], hctx_idx, i); + tags->rqs[i] = NULL; } } @@ -1354,8 +1355,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&tags->page_list); - tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *), - GFP_KERNEL, set->numa_node); + tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + set->numa_node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; @@ -1379,8 +1381,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, this_order--; do { - page = alloc_pages_node(set->numa_node, GFP_KERNEL, - this_order); + page = alloc_pages_node(set->numa_node, + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, + this_order); if (page) break; if (!this_order--) @@ -1404,8 +1407,10 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, tags->rqs[i], hctx_idx, i, - set->numa_node)) + set->numa_node)) { + tags->rqs[i] = NULL; goto fail; + } } p += rq_size; @@ -1416,7 +1421,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return tags; fail: - pr_warn("%s: failed to allocate requests\n", __func__); blk_mq_free_rq_map(set, tags, hctx_idx); return NULL; } @@ -1936,6 +1940,61 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + set->tags[i] = blk_mq_init_rq_map(set, i); + if (!set->tags[i]) + goto out_unwind; + } + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_rq_map(set, set->tags[i], i); + + set->tags = NULL; + return -ENOMEM; +} + +/* + * Allocate the request maps associated with this tag_set. Note that this + * may reduce the depth asked for, if memory is tight. set->queue_depth + * will be updated to reflect the allocated depth. + */ +static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + unsigned int depth; + int err; + + depth = set->queue_depth; + do { + err = __blk_mq_alloc_rq_maps(set); + if (!err) + break; + + set->queue_depth >>= 1; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { + err = -ENOMEM; + break; + } + } while (set->queue_depth); + + if (!set->queue_depth || err) { + pr_err("blk-mq: failed to allocate request map\n"); + return -ENOMEM; + } + + if (depth != set->queue_depth) + pr_info("blk-mq: reduced tag depth (%u -> %u)\n", + depth, set->queue_depth); + + return 0; +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -1944,8 +2003,6 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int i; - if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) @@ -1966,25 +2023,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out; + return -ENOMEM; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) - goto out_unwind; - } + if (blk_mq_alloc_rq_maps(set)) + goto enomem; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; - -out_unwind: - while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); +enomem: kfree(set->tags); set->tags = NULL; -out: return -ENOMEM; } EXPORT_SYMBOL(blk_mq_alloc_tag_set);