From 6131837b1de66116459ef4413e26fdbc70d066dc Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 26 Apr 2018 00:21:58 -0700 Subject: [PATCH 01/16] blk-mq: count allocated but not started requests in iostats inflight In the legacy block case, we increment the counter right after we allocate the request, not when the driver handles it. In both the legacy and blk-mq cases, part_inc_in_flight() is called from blk_account_io_start() right after we've allocated the request. blk-mq, however, only considers started requests as inflight, which is inconsistent with the legacy definition and with the intention in the code. This removes the started condition and instead counts all allocated requests. Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c3621453ad87..5450cbc61f8d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,18 +95,15 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { - /* - * index[0] counts the specific partition that was asked - * for. index[1] counts the ones that are active on the - * whole device, so increment that if mi->part is indeed - * a partition, and not a whole device. - */ - if (rq->part == mi->part) - mi->inflight[0]++; - if (mi->part->partno) - mi->inflight[1]++; - } + /* + * index[0] counts the specific partition that was asked for. index[1] + * counts the ones that are active on the whole device, so increment + * that if mi->part is indeed a partition, and not a whole device. + */ + if (rq->part == mi->part) + mi->inflight[0]++; + if (mi->part->partno) + mi->inflight[1]++; } void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, From bf0ddaba65ddbb2715af97041da8e7a45b2d8628 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 26 Apr 2018 00:21:59 -0700 Subject: [PATCH 02/16] blk-mq: fix sysfs inflight counter When the blk-mq inflight implementation was added, /proc/diskstats was converted to use it, but /sys/block/$dev/inflight was not. Fix it by adding another helper to count in-flight requests by data direction.
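To illustrate the accounting model these two patches converge on -- counters bumped at allocation time and split by data direction -- here is a minimal userspace C sketch. The struct and function names are illustrative stand-ins, not the kernel's:

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-in for struct hd_struct's per-direction counters. */
struct part_model {
	atomic_uint in_flight[2];	/* [0] = read, [1] = write */
};

/* Counted at allocation time (cf. blk_account_io_start()), not at start. */
static void account_start(struct part_model *p, int rw)
{
	atomic_fetch_add(&p->in_flight[rw], 1);
}

static void account_done(struct part_model *p, int rw)
{
	atomic_fetch_sub(&p->in_flight[rw], 1);
}

int main(void)
{
	struct part_model p = { .in_flight = { 0, 0 } };

	account_start(&p, 0);	/* one read allocated */
	account_start(&p, 1);	/* one write allocated */
	account_done(&p, 0);	/* the read completes */

	/* Same "%8u %8u" layout part_inflight_show() emits below. */
	printf("%8u %8u\n", atomic_load(&p.in_flight[0]),
	       atomic_load(&p.in_flight[1]));
	return 0;
}

blk-mq cannot keep such per-partition counters cheaply on the hot path, which is why the blk_mq_in_flight_rw() helper added below iterates the busy tags instead.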
Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ block/blk-mq.h | 4 +++- block/genhd.c | 12 ++++++++++++ block/partition-generic.c | 10 ++++++---- include/linux/genhd.h | 4 +++- 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5450cbc61f8d..9ce9cac16c3f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -115,6 +115,25 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); } +static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if (rq->part == mi->part) + mi->inflight[rq_data_dir(rq)]++; +} + +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part, .inflight = inflight, }; + + inflight[0] = inflight[1] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; diff --git a/block/blk-mq.h b/block/blk-mq.h index 89b5cd3a6c70..e1bb420dc5d6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -188,7 +188,9 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) } void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); + unsigned int inflight[2]); +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) { diff --git a/block/genhd.c b/block/genhd.c index dc7e089373b9..c4513fe1adda 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -82,6 +82,18 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, } } +void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + if (q->mq_ops) { + blk_mq_in_flight_rw(q, part, inflight); + return; + } + + inflight[0] = atomic_read(&part->in_flight[0]); + inflight[1] = atomic_read(&part->in_flight[1]); +} + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); diff --git a/block/partition-generic.c b/block/partition-generic.c index 08dabcd8b6ae..db57cced9b98 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -145,13 +145,15 @@ ssize_t part_stat_show(struct device *dev, jiffies_to_msecs(part_stat_read(p, time_in_queue))); } -ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; - return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), - atomic_read(&p->in_flight[1])); + part_in_flight_rw(q, p, inflight); + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c826b0b5232a..6cb8a5789668 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -368,7 +368,9 @@ static inline void free_part_stats(struct hd_struct *part) part_stat_add(cpu, gendiskp, field, -subnd) void part_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int 
inflight[2]); + unsigned int inflight[2]); +void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, From 6e916a7eb1bc045f4e27355632ee7692014e6e60 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 3 May 2018 18:51:32 +0800 Subject: [PATCH 03/16] bcache: store disk name in struct cache and struct cached_dev Current code uses bdevname() or bio_devname() to reference the gendisk disk name when bcache needs to display disk names in kernel messages. This was safe before the bcache device failure handling patch set was merged, because when devices failed, a deadlock prevented bcache from printing error messages with the gendisk disk name. But after the failure handling patch set was merged, the deadlock is fixed, so it is possible that the gendisk structure bdev->bd_disk is already released when bdevname() is called to reference bdev->bd_disk->disk_name[]. This is why I received bug reports of NULL pointer dereference panics. This patch stores the gendisk disk name in a buffer inside struct cache and struct cached_dev, so printing the name of an offline device no longer references bdev->bd_disk. This patch also avoids extra function calls of bdevname() and bio_devname(). Changelog: v3, add Reviewed-by from Hannes. v2, call bdevname() earlier in register_bdev() v1, first version with suggestion from Junhui Tang. Fixes: c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") Fixes: 5138ac6748e38 ("bcache: fix misleading error message in bch_count_io_errors()") Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 4 ++++ drivers/md/bcache/debug.c | 3 +-- drivers/md/bcache/io.c | 8 +++---- drivers/md/bcache/request.c | 5 +---- drivers/md/bcache/super.c | 44 ++++++++++++++++++------------------- 5 files changed, 30 insertions(+), 34 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d338b7086013..3a0cfb237af9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -392,6 +392,8 @@ struct cached_dev { #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 atomic_t io_errors; unsigned error_limit; + + char backing_dev_name[BDEVNAME_SIZE]; }; enum alloc_reserve { @@ -464,6 +466,8 @@ struct cache { atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; + + char cache_dev_name[BDEVNAME_SIZE]; }; struct gc_stat { diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 028f7b386e01..4e63c6f6c04d 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -106,7 +106,6 @@ void bch_btree_verify(struct btree *b) void bch_data_verify(struct cached_dev *dc, struct bio *bio) { - char name[BDEVNAME_SIZE]; struct bio *check; struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; @@ -134,7 +133,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) bv.bv_len), dc->disk.c, "verify failed at dev %s sector %llu", - bdevname(dc->bdev, name), + dc->backing_dev_name, (uint64_t) bio->bi_iter.bi_sector); kunmap_atomic(p1); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 7fac97ae036e..2ddf8515e6a5 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -52,7 +52,6 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) { -
char buf[BDEVNAME_SIZE]; unsigned errors; WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); @@ -60,7 +59,7 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) errors = atomic_add_return(1, &dc->io_errors); if (errors < dc->error_limit) pr_err("%s: IO error on backing device, unrecoverable", - bio_devname(bio, buf)); + dc->backing_dev_name); else bch_cached_dev_error(dc); } @@ -105,19 +104,18 @@ void bch_count_io_errors(struct cache *ca, } if (error) { - char buf[BDEVNAME_SIZE]; unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, &ca->io_errors); errors >>= IO_ERROR_SHIFT; if (errors < ca->set->error_limit) pr_err("%s: IO error on %s%s", - bdevname(ca->bdev, buf), m, + ca->cache_dev_name, m, is_read ? ", recovering." : "."); else bch_cache_set_error(ca->set, "%s: too many IO errors %s", - bdevname(ca->bdev, buf), m); + ca->cache_dev_name, m); } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a65e3365eeb9..8e3e8655ed63 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -649,11 +649,8 @@ static void backing_request_endio(struct bio *bio) */ if (unlikely(s->iop.writeback && bio->bi_opf & REQ_PREFLUSH)) { - char buf[BDEVNAME_SIZE]; - - bio_devname(bio, buf); pr_err("Can't flush %s: returned bi_status %i", - buf, bio->bi_status); + dc->backing_dev_name, bio->bi_status); } else { /* set to orig_bio->bi_status in bio_complete() */ s->iop.status = bio->bi_status; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d90d9e59ca00..8196b19fada2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -936,7 +936,6 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); - char buf[BDEVNAME_SIZE]; struct closure cl; closure_init_stack(&cl); @@ -967,7 +966,7 @@ static void cached_dev_detach_finish(struct work_struct *w) mutex_unlock(&bch_register_lock); - pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); + pr_info("Caching disabled for %s", dc->backing_dev_name); /* Drop ref we took in cached_dev_detach() */ closure_put(&dc->disk.cl); @@ -999,29 +998,28 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, { uint32_t rtime = cpu_to_le32(get_seconds()); struct uuid_entry *u; - char buf[BDEVNAME_SIZE]; struct cached_dev *exist_dc, *t; - bdevname(dc->bdev, buf); - if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) return -ENOENT; if (dc->disk.c) { - pr_err("Can't attach %s: already attached", buf); + pr_err("Can't attach %s: already attached", + dc->backing_dev_name); return -EINVAL; } if (test_bit(CACHE_SET_STOPPING, &c->flags)) { - pr_err("Can't attach %s: shutting down", buf); + pr_err("Can't attach %s: shutting down", + dc->backing_dev_name); return -EINVAL; } if (dc->sb.block_size < c->sb.block_size) { /* Will die */ pr_err("Couldn't attach %s: block size less than set's block size", - buf); + dc->backing_dev_name); return -EINVAL; } @@ -1029,7 +1027,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) { if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) { pr_err("Tried to attach %s but duplicate UUID already attached", - buf); + dc->backing_dev_name); return -EINVAL; } @@ -1047,13 +1045,15 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set 
*c, if (!u) { if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - pr_err("Couldn't find uuid for %s in set", buf); + pr_err("Couldn't find uuid for %s in set", + dc->backing_dev_name); return -ENOENT; } u = uuid_find_empty(c); if (!u) { - pr_err("Not caching %s, no room for UUID", buf); + pr_err("Not caching %s, no room for UUID", + dc->backing_dev_name); return -EINVAL; } } @@ -1112,7 +1112,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, up_write(&dc->writeback_lock); pr_info("Caching %s as %s on set %pU", - bdevname(dc->bdev, buf), dc->disk.disk->disk_name, + dc->backing_dev_name, + dc->disk.disk->disk_name, dc->disk.c->sb.set_uuid); return 0; } @@ -1225,10 +1226,10 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { - char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; struct cache_set *c; + bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; dc->bdev->bd_holder = dc; @@ -1237,6 +1238,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; get_page(sb_page); + if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -1247,7 +1249,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; - pr_info("registered backing device %s", bdevname(bdev, name)); + pr_info("registered backing device %s", dc->backing_dev_name); list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) @@ -1259,7 +1261,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, return; err: - pr_notice("error %s: %s", bdevname(bdev, name), err); + pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); } @@ -1367,8 +1369,6 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) bool bch_cached_dev_error(struct cached_dev *dc) { - char name[BDEVNAME_SIZE]; - if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return false; @@ -1377,7 +1377,7 @@ bool bch_cached_dev_error(struct cached_dev *dc) smp_mb(); pr_err("stop %s: too many IO errors on backing device %s\n", - dc->disk.disk->disk_name, bdevname(dc->bdev, name)); + dc->disk.disk->disk_name, dc->backing_dev_name); bcache_device_stop(&dc->disk); return true; @@ -2003,12 +2003,10 @@ static int cache_alloc(struct cache *ca) static int register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { - char name[BDEVNAME_SIZE]; const char *err = NULL; /* must be set for any error case */ int ret = 0; - bdevname(bdev, name); - + bdevname(bdev, ca->cache_dev_name); memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; @@ -2045,14 +2043,14 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, goto out; } - pr_info("registered cache device %s", name); + pr_info("registered cache device %s", ca->cache_dev_name); out: kobject_put(&ca->kobj); err: if (err) - pr_notice("error %s: %s", name, err); + pr_notice("error %s: %s", ca->cache_dev_name, err); return ret; } From 6147305c73e4511ca1a975b766b97a779d442567 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 3 May 2018 18:51:33 +0800 Subject: [PATCH 04/16] bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error() Commit c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") tries to stop bcache device by calling 
bcache_device_stop() when too many I/O errors happen on the backing device. But if there is internal I/O happening on the cache device (writeback scan, garbage collection, etc.), a regular I/O request that triggers such internal I/Os may still hold a refcount of dc->count, and that refcount may only be dropped after the internal I/O has stopped. With this patch, bch_cached_dev_error() checks whether the backing device is attached to a cache set; if so, CACHE_SET_IO_DISABLE is set in the flags of that cache set. Then internal I/Os on the cache device are rejected and stopped immediately, and the bcache device can be stopped. For people who are not familiar with the interesting refcount dependence, let me explain a bit more how the fix works. For example, the writeback thread scans the cache device for dirty data to write back. Before it stops, it holds a refcount of dc->count. When the CACHE_SET_IO_DISABLE bit is set, the internal I/O is stopped, the while-loop in bch_writeback_thread() quits, and cached_dev_put() is called to drop dc->count. If this is the last refcount to drop, then cached_dev_detach_finish() will be called. In this callback function, closure_put(dc->disk.cl) is in turn called to drop a refcount of the closure dc->disk.cl. If this is the last refcount of this closure to drop, then cached_dev_flush() will be called. Then the cached device is freed. So if CACHE_SET_IO_DISABLE is not set, the bcache device cannot be stopped until all internal cache device I/O has stopped. For a large cache device, with the writeback thread competing for locks with the gc thread, that can be quite a long wait. Fixes: c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8196b19fada2..c017cd444c66 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1369,6 +1369,8 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) bool bch_cached_dev_error(struct cached_dev *dc) { + struct cache_set *c; + if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return false; @@ -1379,6 +1381,21 @@ bool bch_cached_dev_error(struct cached_dev *dc) pr_err("stop %s: too many IO errors on backing device %s\n", dc->disk.disk->disk_name, dc->backing_dev_name); + /* + * If the cached device is still attached to a cache set, + * even dc->io_disable is true and no more I/O requests + * accepted, cache device internal I/O (writeback scan or + * garbage collection) may still prevent bcache device from + * being stopped. So here CACHE_SET_IO_DISABLE should be + * set to c->flags too, to make the internal I/O to cache + * device rejected and stopped immediately. + * If c is NULL, that means the bcache device is not attached + * to any cache set, then no CACHE_SET_IO_DISABLE bit to set. + */ + c = dc->disk.c; + if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) + pr_info("CACHE_SET_IO_DISABLE already set"); + bcache_device_stop(&dc->disk); return true; } From bf78980fcc58bad2d61858ce342153a3dd097aa0 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 3 May 2018 18:51:34 +0800 Subject: [PATCH 05/16] bcache: count backing device I/O error for writeback I/O Commit c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") counts backing device I/O requests and sets dc->io_disable to true if the error counter exceeds dc->io_error_limit.
But it only counts I/O errors for regular I/O requests, and neglects errors of writeback I/Os when the backing device is offline. This patch counts the errors of writeback I/Os: in dirty_endio(), if bio->bi_status is not 0, an error happened while writing dirty keys to the backing device, and bch_count_backing_io_errors() is called. With this fix, even if no regular I/O requests are coming in, the bcache device may still be stopped for the broken backing device once writeback I/O errors exceed dc->io_error_limit. Fixes: c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4a9547cdcdc5..ad45ebe1a74b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -244,8 +244,10 @@ static void dirty_endio(struct bio *bio) struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; - if (bio->bi_status) + if (bio->bi_status) { SET_KEY_DIRTY(&w->key, false); + bch_count_backing_io_errors(io->dc, bio); + } closure_put(&io->cl); } From ecb2ba8cb83549f1bb06bc7e693ae8fed43c0e4f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 3 May 2018 18:51:35 +0800 Subject: [PATCH 06/16] bcache: add wait_for_kthread_stop() in bch_allocator_thread() When CACHE_SET_IO_DISABLE is set in the cache set flags, the bcache allocator thread routine bch_allocator_thread() may stop its while-loops and exit. Then it is possible to observe the following kernel oops message, [ 631.068366] bcache: bch_btree_insert() error -5 [ 631.069115] bcache: cached_dev_detach_finish() Caching disabled for sdf [ 631.070220] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 631.070250] PGD 0 P4D 0 [ 631.070261] Oops: 0002 [#1] SMP PTI [snipped] [ 631.070578] Workqueue: events cache_set_flush [bcache] [ 631.070597] RIP: 0010:exit_creds+0x1b/0x50 [ 631.070610] RSP: 0018:ffffc9000705fe08 EFLAGS: 00010246 [ 631.070626] RAX: 0000000000000001 RBX: ffff880a622ad300 RCX: 000000000000000b [ 631.070645] RDX: 0000000000000601 RSI: 000000000000000c RDI: 0000000000000000 [ 631.070663] RBP: ffff880a622ad300 R08: ffffea00190c66e0 R09: 0000000000000200 [ 631.070682] R10: ffff880a48123000 R11: ffff880000000000 R12: 0000000000000000 [ 631.070700] R13: ffff880a4b160e40 R14: ffff880a4b160000 R15: 0ffff880667e2530 [ 631.070719] FS: 0000000000000000(0000) GS:ffff880667e00000(0000) knlGS:0000000000000000 [ 631.070740] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 631.070755] CR2: 0000000000000000 CR3: 000000000200a001 CR4: 00000000003606e0 [ 631.070774] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 631.070793] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 631.070811] Call Trace: [ 631.070828] __put_task_struct+0x55/0x160 [ 631.070845] kthread_stop+0xee/0x100 [ 631.070863] cache_set_flush+0x11d/0x1a0 [bcache] [ 631.070879] process_one_work+0x146/0x340 [ 631.070892] worker_thread+0x47/0x3e0 [ 631.070906] kthread+0xf5/0x130 [ 631.070917] ? max_active_store+0x60/0x60 [ 631.070930] ?
kthread_bind+0x10/0x10 [ 631.070945] ret_from_fork+0x35/0x40 [snipped] [ 631.071017] RIP: exit_creds+0x1b/0x50 RSP: ffffc9000705fe08 [ 631.071033] CR2: 0000000000000000 [ 631.071045] ---[ end trace 011c63a24b22c927 ]--- [ 631.071085] bcache: bcache_device_free() bcache0 stopped The reason is that when cache_set_flush() tries to call kthread_stop() to stop the allocator thread, the thread has already exited due to cache device I/O errors. This patch adds wait_for_kthread_stop() at the tail of bch_allocator_thread(), to prevent the thread routine from exiting directly. Then the allocator thread can be blocked at wait_for_kthread_stop() and wait for cache_set_flush() to stop it by calling kthread_stop(). changelog: v3: add Reviewed-by from Hannes. v2: not directly return from allocator_wait(), move 'return 0' to tail of bch_allocator_thread(). v1: initial version. Fixes: 771f393e8ffc ("bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags") Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 004cc3cc6123..7fa2631b422c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -290,7 +290,7 @@ do { \ if (kthread_should_stop() || \ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ set_current_state(TASK_RUNNING); \ - return 0; \ + goto out; \ } \ \ schedule(); \ @@ -378,6 +378,9 @@ retry_invalidate: bch_prio_write(ca); } } +out: + wait_for_kthread_stop(); + return 0; } /* Allocation */ From 4fd8e13843978cbba48b8c21119da60c7fd5910d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 3 May 2018 18:51:36 +0800 Subject: [PATCH 07/16] bcache: set dc->io_disable to true in conditional_stop_bcache_device() Commit 7e027ca4b534b ("bcache: add stop_when_cache_set_failed option to backing device") adds the stop_when_cache_set_failed option and stops the bcache device if stop_when_cache_set_failed is auto and there is dirty data on the broken cache device. There might exist a small time gap in which the cache set is released and set to NULL but the bcache device is not released yet (because they are released in parallel). During this time gap, dc->c is NULL, so CACHE_SET_IO_DISABLE won't be checked, and dc->io_disable is still false, so newly arriving I/O requests will be accepted and go directly into the backing device, as if no cache set were attached. If there is dirty data on the cache device, this behavior may introduce potentially inconsistent data. This patch sets dc->io_disable to true before calling bcache_device_stop() to make sure the backing device rejects newly arriving I/O requests as well, so even in the small time gap no I/O goes directly into the backing device to corrupt data consistency. Fixes: 7e027ca4b534b ("bcache: add stop_when_cache_set_failed option to backing device") Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c017cd444c66..cedbb41533c2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1556,6 +1556,20 @@ static void conditional_stop_bcache_device(struct cache_set *c, */ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.", d->disk->disk_name); + /* + * There might be a small time gap that cache set is + * released but bcache device is not.
Inside this time + * gap, regular I/O requests will directly go into + * backing device as no cache set attached to. This + * behavior may also introduce potential inconsistence + * data in writeback mode while cache is dirty. + * Therefore before calling bcache_device_stop() due + * to a broken cache device, dc->io_disable should be + * explicitly set to true. + */ + dc->io_disable = true; + /* make others know io_disable is true earlier */ + smp_mb(); bcache_device_stop(d); } else { /* From 09a44ca2114737e0932257619c16a2b50c7807f1 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 3 May 2018 18:51:37 +0800 Subject: [PATCH 08/16] bcache: use pr_info() to inform duplicated CACHE_SET_IO_DISABLE set It is possible that multiple I/O requests hit a failed cache device or backing device, therefore it is quite common that CACHE_SET_IO_DISABLE is already set when a task tries to set the bit from bch_cache_set_error(). Currently the message "CACHE_SET_IO_DISABLE already set" is printed by pr_warn(), which might mislead users into thinking a serious fault has happened in the code. This patch uses pr_info() to print the information in such situations, avoiding extra worries. This information is helpful for understanding bcache behavior in cache device failures, so I still keep it in the source code. Fixes: 771f393e8ffc9 ("bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags") Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cedbb41533c2..3dea06b41d43 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1412,7 +1412,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) return false; if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) - pr_warn("CACHE_SET_IO_DISABLE already set"); + pr_info("CACHE_SET_IO_DISABLE already set"); /* XXX: we can be called from atomic context acquire_console_sem(); From 8236b0ae31c837d2b3a2565c5f8d77f637e824cc Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 2 May 2018 07:07:55 +0900 Subject: [PATCH 09/16] bdi: wake up concurrent wb_shutdown() callers. syzbot is reporting hung tasks at wait_on_bit(WB_shutting_down) in wb_shutdown() [1]. This seems to be because commit 5318ce7d46866e1d ("bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy()") forgot to call wake_up_bit(WB_shutting_down) after clear_bit(WB_shutting_down). Introduce a helper function clear_and_wake_up_bit() and use it, in order to avoid similar errors in the future.
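To make the pairing concrete, the two sides of the handshake look roughly like this after the patch. This is an illustrative fragment, not a standalone program; the waiter side is the wait_on_bit() loop in wb_shutdown() that syzbot saw hanging:

/* Waker: the tail of wb_shutdown(), once shutdown has completed. */
	smp_wmb();	/* order the ->state update before the wakeup */
	clear_and_wake_up_bit(WB_shutting_down, &wb->state);

	/*
	 * Waiter: a concurrent wb_shutdown() parked on the same bit. With
	 * the old bare clear_bit() and no wake_up_bit(), this sleep was
	 * never ended -- the hung tasks syzbot reported.
	 */
	wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);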
[1] https://syzkaller.appspot.com/bug?id=b297474817af98d5796bc544e1bb806fc3da0e5e Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: 5318ce7d46866e1d ("bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy()") Cc: Tejun Heo Reviewed-by: Jan Kara Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe --- include/linux/wait_bit.h | 17 +++++++++++++++++ mm/backing-dev.c | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9318b2166439..2b0072fa5e92 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -305,4 +305,21 @@ do { \ __ret; \ }) +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. */ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} + #endif /* _LINUX_WAIT_BIT_H */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 023190c69dce..fa5e6d7406d1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -383,7 +383,7 @@ static void wb_shutdown(struct bdi_writeback *wb) * the barrier provided by test_and_clear_bit() above. */ smp_wmb(); - clear_bit(WB_shutting_down, &wb->state); + clear_and_wake_up_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) From f53823c18131e755905b4f654196fd2cc3953f6e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 23 Apr 2018 11:21:03 +0900 Subject: [PATCH 10/16] bdi: Fix use after free bug in debugfs_remove() syzbot is reporting a use-after-free bug in debugfs_remove() [1]. Fault injection made the memory allocation for debugfs_create_file(), called from bdi_debug_register() via bdi_register_va(), fail, yet registration continued and set WB_registered. The error path had already called debugfs_remove(bdi->debug_dir) but left the stale dentry pointer in place, so IS_ERR_OR_NULL(bdi->debug_dir) == false. Later, because WB_registered was set, release_bdi() calls bdi_unregister(), whose bdi_debug_unregister() calls debugfs_remove(bdi->debug_dir) a second time on the already-freed dentry. Fix this by clearing bdi->debug_dir in the error path, making IS_ERR_OR_NULL(bdi->debug_dir) == true. [1] https://syzkaller.appspot.com/bug?id=5ab4efd91a96dcea9b68104f159adf4af2a6dfc1 Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: 97f07697932e6faf ("bdi: convert bdi_debug_register to int") Cc: weiping zhang Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- mm/backing-dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index fa5e6d7406d1..7441bd93b732 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -115,6 +115,7 @@ static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) bdi, &bdi_debug_stats_fops); if (!bdi->debug_stats) { debugfs_remove(bdi->debug_dir); + bdi->debug_dir = NULL; return -ENOMEM; } From 59a2f3f00fd744dbad22593f47552037d3154ca6 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 14 Apr 2018 20:06:19 +0800 Subject: [PATCH 11/16] nvme: fix potential memory leak in option parsing When the same string-type option is specified several times, the current option parsing may leak memory. Hence, call kfree() for the previous value in this case.
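The leak pattern is easy to reproduce outside the kernel; a minimal userspace analogue, with strdup() standing in for the allocation the parser performs per occurrence:

#include <stdlib.h>
#include <string.h>

/*
 * Parsing "transport=a,transport=b": each assignment would orphan the
 * previous allocation without the free() the patch adds via kfree().
 */
int main(void)
{
	char *transport = NULL;
	const char *tokens[] = { "a", "b" };

	for (int i = 0; i < 2; i++) {
		char *p = strdup(tokens[i]);

		if (!p)
			return 1;
		free(transport);	/* the fix: drop the old value first */
		transport = p;
	}
	free(transport);
	return 0;
}

Note kfree(NULL) is a no-op, just like free(NULL), which is why the patch below can free unconditionally before each assignment.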
Signed-off-by: Chengguang Xu Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/fabrics.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 124c458806df..7ae732a77fe8 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -668,6 +668,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } + kfree(opts->transport); opts->transport = p; break; case NVMF_OPT_NQN: @@ -676,6 +677,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } + kfree(opts->subsysnqn); opts->subsysnqn = p; nqnlen = strlen(opts->subsysnqn); if (nqnlen >= NVMF_NQN_SIZE) { @@ -698,6 +700,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } + kfree(opts->traddr); opts->traddr = p; break; case NVMF_OPT_TRSVCID: @@ -706,6 +709,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } + kfree(opts->trsvcid); opts->trsvcid = p; break; case NVMF_OPT_QUEUE_SIZE: @@ -792,6 +796,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -EINVAL; goto out; } + nvmf_host_put(opts->host); opts->host = nvmf_host_add(p); kfree(p); if (!opts->host) { @@ -817,6 +822,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } + kfree(opts->host_traddr); opts->host_traddr = p; break; case NVMF_OPT_HOST_ID: From f31a21103c03bb62846409fdc60cc9faf2398cfb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 17 Apr 2018 14:42:44 -0600 Subject: [PATCH 12/16] nvme: Set integrity flag for user passthrough commands If the command has a separate metadata buffer attached, the request needs to have the integrity flag set so the driver knows to map it. Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9df4f71e58ca..127a9cbf3314 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -764,6 +764,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, ret = PTR_ERR(meta); goto out_unmap; } + req->cmd_flags |= REQ_INTEGRITY; } } From 5cadde8019a6a80550fdde92d5a3327565974eab Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Apr 2018 14:24:29 -0600 Subject: [PATCH 13/16] nvme/multipath: Disable runtime writable enabling parameter We can't allow the user to change multipath settings at runtime, as this will create naming conflicts due to the different naming schemes used for each mode.
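To see the conflict, consider the two schemes side by side. This is a userspace sketch with made-up instance numbers; the real naming logic is consolidated into nvme_set_disk_name() in the next patch:

#include <stdio.h>

int main(void)
{
	char with_mp[32], without_mp[32];
	/*
	 * Hypothetical topology: subsystem 0 has multiple controllers;
	 * controller 0 belongs to a different, single-controller subsystem.
	 */
	int subsys_instance = 0, ctrl_instance = 0, ns_instance = 1;

	/* multipath=Y: shared head node named from the subsystem instance */
	snprintf(with_mp, sizeof(with_mp), "nvme%dn%d",
		 subsys_instance, ns_instance);
	/* multipath=N: node named from the controller instance */
	snprintf(without_mp, sizeof(without_mp), "nvme%dn%d",
		 ctrl_instance, ns_instance);

	/*
	 * Both print "nvme0n1" for two different devices -- the clash a
	 * runtime toggle between the schemes would allow.
	 */
	printf("%s vs %s\n", with_mp, without_mp);
	return 0;
}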
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 956e0b8e9c4d..3ded009e7631 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -15,7 +15,7 @@ #include "nvme.h" static bool multipath = true; -module_param(multipath, bool, 0644); +module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); From a785dbccd95c37606c720580714f5a7a8b3255f1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Apr 2018 14:22:41 -0600 Subject: [PATCH 14/16] nvme/multipath: Fix multipath disabled naming collisions When CONFIG_NVME_MULTIPATH is set but nvme multipathing is not in use, namespaces with multiple paths were not creating unique names because they reused the same instance number from the namespace's head. This patch fixes this by falling back to the non-multipath naming method when the module parameter has disabled multipath. Reported-by: Mike Snitzer Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 26 +------------------------- drivers/nvme/host/multipath.c | 22 ++++++++++++++++++++++ drivers/nvme/host/nvme.h | 12 ++++++++++++ 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 127a9cbf3314..a3771c5729f5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2998,31 +2998,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_init_ns_head(ns, nsid, id)) goto out_free_id; nvme_setup_streams_ns(ctrl, ns); - -#ifdef CONFIG_NVME_MULTIPATH - /* - * If multipathing is enabled we need to always use the subsystem - * instance number for numbering our devices to avoid conflicts - * between subsystems that have multiple controllers and thus use - * the multipath-aware subsystem node and those that have a single - * controller and use the controller node directly. - */ - if (ns->head->disk) { - sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, - ctrl->cntlid, ns->head->instance); - flags = GENHD_FL_HIDDEN; - } else { - sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, - ns->head->instance); - } -#else - /* - * But without the multipath code enabled, multiple controller per - * subsystems are visible as devices and thus we cannot use the - * subsystem instance. - */ - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); -#endif + nvme_set_disk_name(disk_name, ns, ctrl, &flags); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3ded009e7631..d7b664ae5923 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -19,6 +19,28 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +/* + * If multipathing is enabled we need to always use the subsystem instance + * number for numbering our devices to avoid conflicts between subsystems that + * have multiple controllers and thus use the multipath-aware subsystem node + * and those that have a single controller and use the controller node + * directly.
+ */ +void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags) +{ + if (!multipath) { + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); + } else if (ns->head->disk) { + sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, + ctrl->cntlid, ns->head->instance); + *flags = GENHD_FL_HIDDEN; + } else { + sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, + ns->head->instance); + } +} + void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 061fecfd44f5..7ded7a51c430 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -436,6 +436,8 @@ extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH +void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); bool nvme_req_needs_failover(struct request *req, blk_status_t error); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); @@ -461,6 +463,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) } #else +/* + * Without the multipath code enabled, multiple controller per subsystems are + * visible as devices and thus we cannot use the subsystem instance. + */ +static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags) +{ + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); +} + static inline void nvme_failover_req(struct request *req) { } From 8bfc3b4c6f9de815de4ab73784b9419348266a65 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 3 May 2018 17:00:35 +0200 Subject: [PATCH 15/16] nvmet: switch loopback target state to connecting when resetting After commit bb06ec31452f ("nvme: expand nvmf_check_if_ready checks") resetting of the loopback nvme target failed, as we forgot to switch its state to NVME_CTRL_CONNECTING before we reconnect the admin queues. Therefore the checks in nvmf_check_if_ready() took the reject_io path, and thus we couldn't send out an identify controller command to reconnect. Change the controller state to NVME_CTRL_CONNECTING after tearing down the old connection and before re-establishing the connection. Fixes: bb06ec31452f ("nvme: expand nvmf_check_if_ready checks") Signed-off-by: Johannes Thumshirn Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/target/loop.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 31fdfba556a8..27a8561c0cb9 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -469,6 +469,12 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_loop_shutdown_ctrl(ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + ret = nvme_loop_configure_admin_queue(ctrl); if (ret) goto out_disable; From b8b784958eccbf8f51ebeee65282ca3fd59ea391 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 May 2018 18:26:26 +0200 Subject: [PATCH 16/16] bdi: Fix oops in wb_workfn() Syzbot has reported that it can hit a NULL pointer dereference in wb_workfn() due to wb->bdi->dev being NULL.
This indicates that wb_workfn() was called for an already unregistered bdi, which should not happen, as wb_shutdown() called from bdi_unregister() should make sure all pending writeback work is completed before the bdi is unregistered. Except that wb_workfn() itself can requeue the work with: mod_delayed_work(bdi_wq, &wb->dwork, 0); and if this happens while wb_shutdown() is waiting in: flush_delayed_work(&wb->dwork); the dwork can get executed after wb_shutdown() has finished and bdi_unregister() has cleared wb->bdi->dev. Make wb_workfn() use wb_wakeup() for requeueing the work, which takes all the necessary precautions against racing with bdi unregistration. CC: Tetsuo Handa CC: Tejun Heo Fixes: 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Reported-by: syzbot Reviewed-by: Dave Chinner Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 47d7c151fcba..471d863958bc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1961,7 +1961,7 @@ void wb_workfn(struct work_struct *work) } if (!list_empty(&wb->work_list)) - mod_delayed_work(bdi_wq, &wb->dwork, 0); + wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb);
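For reference, the precaution wb_wakeup() takes is to recheck registration under wb->work_lock before requeueing. Its body has roughly the following shape -- a sketch, with the exact locking details an assumption; the essential part is the WB_registered check that a bare mod_delayed_work() lacks:

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	/*
	 * Assumed shape: once bdi_unregister() has cleared WB_registered,
	 * refuse to requeue the dwork, closing the race described above.
	 */
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}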