From 973e5405f2f67ddbb2bf07b3ffc71908a37fea8e Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Mon, 13 Aug 2018 16:01:10 +0200
Subject: [PATCH 01/15] xen/blkback: don't keep persistent grants too long
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Persistent grants are allocated until a threshold per ring is being
reached. Those grants won't be freed until the ring is being destroyed
meaning there will be resources kept busy which might no longer be used.

Instead of freeing only persistent grants until the threshold is reached
add a timestamp and remove all persistent grants not having been in use
for a minute.

Signed-off-by: Juergen Gross
Reviewed-by: Roger Pau Monné
Signed-off-by: Konrad Rzeszutek Wilk
---
 .../ABI/testing/sysfs-driver-xen-blkback | 10 +++
 drivers/block/xen-blkback/blkback.c      | 90 ++++++++++---------
 drivers/block/xen-blkback/common.h       |  8 +-
 3 files changed, 61 insertions(+), 47 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
index 8bb43b66eb55..4e7babb3ba1f 100644
--- a/Documentation/ABI/testing/sysfs-driver-xen-blkback
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -15,3 +15,13 @@ Description:
                 blkback. If the frontend tries to use more than
                 max_persistent_grants, the LRU kicks in and starts
                 removing 5% of max_persistent_grants every 100ms.
+
+What:           /sys/module/xen_blkback/parameters/persistent_grant_unused_seconds
+Date:           August 2018
+KernelVersion:  4.19
+Contact:        Roger Pau Monné
+Description:
+                How long a persistent grant is allowed to remain
+                allocated without being in use. The time is in
+                seconds, 0 means indefinitely long.
+                The default is 60 seconds.
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index b55b245e8052..9eae7b243f68 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -83,6 +83,18 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
 MODULE_PARM_DESC(max_persistent_grants,
                  "Maximum number of grants to map persistently");
 
+/*
+ * How long a persistent grant is allowed to remain allocated without being in
+ * use. The time is in seconds, 0 means indefinitely long.
+ */
+
+static unsigned int xen_blkif_pgrant_timeout = 60;
+module_param_named(persistent_grant_unused_seconds, xen_blkif_pgrant_timeout,
+                   uint, 0644);
+MODULE_PARM_DESC(persistent_grant_unused_seconds,
+                 "Time in seconds an unused persistent grant is allowed to "
+                 "remain allocated. Default is 60, 0 means unlimited.");
+
 /*
  * Maximum number of rings/queues blkback supports, allow as many queues as there
  * are CPUs if user has not specified a value.
@@ -123,6 +135,13 @@ module_param(log_stats, int, 0644);
 /* Number of free pages to remove on each call to gnttab_free_pages */
 #define NUM_BATCH_FREE_PAGES 10
 
+static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
+{
+        return xen_blkif_pgrant_timeout &&
+                (jiffies - persistent_gnt->last_used >=
+                 HZ * xen_blkif_pgrant_timeout);
+}
+
 static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
 {
         unsigned long flags;
@@ -278,7 +297,7 @@ static void put_persistent_gnt(struct xen_blkif_ring *ring,
 {
         if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                 pr_alert_ratelimited("freeing a grant already unused\n");
-        set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+        persistent_gnt->last_used = jiffies;
         clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
         atomic_dec(&ring->persistent_gnt_in_use);
 }
@@ -371,26 +390,26 @@ static void purge_persistent_gnt(struct xen_blkif_ring *ring)
         struct persistent_gnt *persistent_gnt;
         struct rb_node *n;
         unsigned int num_clean, total;
-        bool scan_used = false, clean_used = false;
+        bool scan_used = false;
         struct rb_root *root;
 
-        if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
-            (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
-            !ring->blkif->vbd.overflow_max_grants)) {
-                goto out;
-        }
-
         if (work_busy(&ring->persistent_purge_work)) {
                 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
                 goto out;
         }
 
-        num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
-        num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
-        num_clean = min(ring->persistent_gnt_c, num_clean);
-        if ((num_clean == 0) ||
-            (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
-                goto out;
+        if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
+            (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
+            !ring->blkif->vbd.overflow_max_grants)) {
+                num_clean = 0;
+        } else {
+                num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
+                num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants +
+                        num_clean;
+                num_clean = min(ring->persistent_gnt_c, num_clean);
+                pr_debug("Going to purge at least %u persistent grants\n",
+                         num_clean);
+        }
 
         /*
          * At this point, we can assure that there will be no calls
@@ -401,9 +420,7 @@ static void purge_persistent_gnt(struct xen_blkif_ring *ring)
          * number of grants.
          */
-        total = num_clean;
-
-        pr_debug("Going to purge %u persistent grants\n", num_clean);
+        total = 0;
 
         BUG_ON(!list_empty(&ring->persistent_purge_list));
         root = &ring->persistent_gnts;
@@ -412,47 +429,38 @@ purge_list:
                 BUG_ON(persistent_gnt->handle ==
                         BLKBACK_INVALID_HANDLE);
 
-                if (clean_used) {
-                        clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
-                        continue;
-                }
-
                 if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                         continue;
-                if (!scan_used &&
-                    (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
+                if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
+                        continue;
+                if (scan_used && total >= num_clean)
                         continue;
 
                 rb_erase(&persistent_gnt->node, root);
                 list_add(&persistent_gnt->remove_node,
                          &ring->persistent_purge_list);
-                if (--num_clean == 0)
-                        goto finished;
+                total++;
         }
         /*
-         * If we get here it means we also need to start cleaning
+         * Check whether we also need to start cleaning
          * grants that were used since last purge in order to cope
          * with the requested num
          */
-        if (!scan_used && !clean_used) {
-                pr_debug("Still missing %u purged frames\n", num_clean);
+        if (!scan_used && total < num_clean) {
+                pr_debug("Still missing %u purged frames\n", num_clean - total);
                 scan_used = true;
                 goto purge_list;
         }
-finished:
-        if (!clean_used) {
-                pr_debug("Finished scanning for grants to clean, removing used flag\n");
-                clean_used = true;
-                goto purge_list;
+
+        if (total) {
+                ring->persistent_gnt_c -= total;
+                ring->blkif->vbd.overflow_max_grants = 0;
+
+                /* We can defer this work */
+                schedule_work(&ring->persistent_purge_work);
+                pr_debug("Purged %u/%u\n", num_clean, total);
         }
-        ring->persistent_gnt_c -= (total - num_clean);
-        ring->blkif->vbd.overflow_max_grants = 0;
-
-        /* We can defer this work */
-        schedule_work(&ring->persistent_purge_work);
-        pr_debug("Purged %u/%u\n", (total - num_clean), total);
-
 out:
         return;
 }
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index ecb35fe8ca8d..7bff72db3b7e 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -234,14 +234,9 @@ struct xen_vbd {
 struct backend_info;
 
 /* Number of available flags */
-#define PERSISTENT_GNT_FLAGS_SIZE       2
+#define PERSISTENT_GNT_FLAGS_SIZE       1
 /* This persistent grant is currently in use */
 #define PERSISTENT_GNT_ACTIVE   0
-/*
- * This persistent grant has been used, this flag is set when we remove the
- * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
- */
-#define PERSISTENT_GNT_WAS_ACTIVE       1
 
 /* Number of requests that we can fit in a ring */
 #define XEN_BLKIF_REQS_PER_PAGE 32
@@ -250,6 +245,7 @@ struct persistent_gnt {
         struct page *page;
         grant_ref_t gnt;
         grant_handle_t handle;
+        unsigned long last_used;
         DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
         struct rb_node node;
         struct list_head remove_node;

From a46b53672b2c2e3770b38a4abf90d16364d2584b Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Mon, 13 Aug 2018 16:01:11 +0200
Subject: [PATCH 02/15] xen/blkfront: cleanup stale persistent grants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a periodic cleanup function to remove old persistent grants which
are no longer in use on the backend side. This avoids starvation in
case there are lots of persistent grants for a device which no longer
is involved in I/O business.
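
The cleanup relies on the backend having already unmapped a grant: once
that has happened, gnttab_query_foreign_access() reports no remaining
foreign access and the frontend can safely reclaim the page. A minimal
sketch of that test (illustrative only, simplified from the real purge
loop further down; locking omitted):

        /* sketch: reclaim a persistent grant the backend no longer uses */
        if (gnt_list_entry->gref != GRANT_INVALID_REF &&
            !gnttab_query_foreign_access(gnt_list_entry->gref)) {
                list_del(&gnt_list_entry->node);
                gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL);
                __free_page(gnt_list_entry->page);
                kfree(gnt_list_entry);
        }

The work item doing this rescan is rescheduled every 10 seconds for as
long as at least one device still uses persistent grants.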
Signed-off-by: Juergen Gross
Reviewed-by: Roger Pau Monné
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkfront.c | 94 ++++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8986adab9bf5..a2a395f85a41 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -121,6 +122,8 @@ static inline struct blkif_req *blkif_req(struct request *rq)
 
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
+static struct delayed_work blkfront_work;
+static LIST_HEAD(info_list);
 
 /*
  * Maximum number of segments in indirect requests, the actual value used by
@@ -216,6 +219,7 @@ struct blkfront_info
         /* Save uncomplete reqs and bios for migration. */
         struct list_head requests;
         struct bio_list bio_list;
+        struct list_head info_list;
 };
 
 static unsigned int nr_minors;
@@ -1759,6 +1763,12 @@ abort_transaction:
         return err;
 }
 
+static void free_info(struct blkfront_info *info)
+{
+        list_del(&info->info_list);
+        kfree(info);
+}
+
 /* Common code used when first setting up, and when resuming. */
 static int talk_to_blkback(struct xenbus_device *dev,
                            struct blkfront_info *info)
@@ -1880,7 +1890,10 @@ again:
  destroy_blkring:
         blkif_free(info, 0);
 
-        kfree(info);
+        mutex_lock(&blkfront_mutex);
+        free_info(info);
+        mutex_unlock(&blkfront_mutex);
+
         dev_set_drvdata(&dev->dev, NULL);
 
         return err;
@@ -1991,6 +2004,10 @@ static int blkfront_probe(struct xenbus_device *dev,
         info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
         dev_set_drvdata(&dev->dev, info);
 
+        mutex_lock(&blkfront_mutex);
+        list_add(&info->info_list, &info_list);
+        mutex_unlock(&blkfront_mutex);
+
         return 0;
 }
 
@@ -2301,6 +2318,12 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
         if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST)
                 indirect_segments = 0;
         info->max_indirect_segments = indirect_segments;
+
+        if (info->feature_persistent) {
+                mutex_lock(&blkfront_mutex);
+                schedule_delayed_work(&blkfront_work, HZ * 10);
+                mutex_unlock(&blkfront_mutex);
+        }
 }
 
 /*
@@ -2482,7 +2505,9 @@ static int blkfront_remove(struct xenbus_device *xbdev)
         mutex_unlock(&info->mutex);
 
         if (!bdev) {
-                kfree(info);
+                mutex_lock(&blkfront_mutex);
+                free_info(info);
+                mutex_unlock(&blkfront_mutex);
                 return 0;
         }
 
@@ -2502,7 +2527,9 @@ static int blkfront_remove(struct xenbus_device *xbdev)
         if (info && !bdev->bd_openers) {
                 xlvbd_release_gendisk(info);
                 disk->private_data = NULL;
-                kfree(info);
+                mutex_lock(&blkfront_mutex);
+                free_info(info);
+                mutex_unlock(&blkfront_mutex);
         }
 
         mutex_unlock(&bdev->bd_mutex);
@@ -2585,7 +2612,7 @@ static void blkif_release(struct gendisk *disk, fmode_t mode)
                 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
                 xlvbd_release_gendisk(info);
                 disk->private_data = NULL;
-                kfree(info);
+                free_info(info);
         }
 
 out:
@@ -2618,6 +2645,61 @@ static struct xenbus_driver blkfront_driver = {
         .is_ready = blkfront_is_ready,
 };
 
+static void purge_persistent_grants(struct blkfront_info *info)
+{
+        unsigned int i;
+        unsigned long flags;
+
+        for (i = 0; i < info->nr_rings; i++) {
+                struct blkfront_ring_info *rinfo = &info->rinfo[i];
+                struct grant *gnt_list_entry, *tmp;
+
+                spin_lock_irqsave(&rinfo->ring_lock, flags);
+
+                if (rinfo->persistent_gnts_c == 0) {
+                        spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+                        continue;
+                }
+
+                list_for_each_entry_safe(gnt_list_entry, tmp,
+                                         &rinfo->grants, node) {
+                        if (gnt_list_entry->gref == GRANT_INVALID_REF ||
+                            gnttab_query_foreign_access(gnt_list_entry->gref))
+                                continue;
+
+                        list_del(&gnt_list_entry->node);
+                        gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL);
+                        rinfo->persistent_gnts_c--;
+                        __free_page(gnt_list_entry->page);
+                        kfree(gnt_list_entry);
+                }
+
+                spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+        }
+}
+
+static void blkfront_delay_work(struct work_struct *work)
+{
+        struct blkfront_info *info;
+        bool need_schedule_work = false;
+
+        mutex_lock(&blkfront_mutex);
+
+        list_for_each_entry(info, &info_list, info_list) {
+                if (info->feature_persistent) {
+                        need_schedule_work = true;
+                        mutex_lock(&info->mutex);
+                        purge_persistent_grants(info);
+                        mutex_unlock(&info->mutex);
+                }
+        }
+
+        if (need_schedule_work)
+                schedule_delayed_work(&blkfront_work, HZ * 10);
+
+        mutex_unlock(&blkfront_mutex);
+}
+
 static int __init xlblk_init(void)
 {
         int ret;
@@ -2650,6 +2732,8 @@ static int __init xlblk_init(void)
                 return -ENODEV;
         }
 
+        INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work);
+
         ret = xenbus_register_frontend(&blkfront_driver);
         if (ret) {
                 unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
@@ -2663,6 +2747,8 @@ module_init(xlblk_init);
 
 static void __exit xlblk_exit(void)
 {
+        cancel_delayed_work_sync(&blkfront_work);
+
         xenbus_unregister_driver(&blkfront_driver);
         unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
         kfree(minors);

From 4bcddbae019df2614ea36976ba3d36313f93c6d3 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Mon, 13 Aug 2018 16:01:12 +0200
Subject: [PATCH 03/15] xen/blkfront: reorder tests in xlblk_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case we don't want pv block devices we should not test parameters
for sanity and eventually print out error messages. So test precluding
conditions before checking parameters.

Signed-off-by: Juergen Gross
Reviewed-by: Roger Pau Monné
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkfront.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a2a395f85a41..a71d817e900d 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2708,6 +2708,15 @@ static int __init xlblk_init(void)
         if (!xen_domain())
                 return -ENODEV;
 
+        if (!xen_has_pv_disk_devices())
+                return -ENODEV;
+
+        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
+                pr_warn("xen_blk: can't get major %d with name %s\n",
+                        XENVBD_MAJOR, DEV_NAME);
+                return -ENODEV;
+        }
+
         if (xen_blkif_max_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
                 xen_blkif_max_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
@@ -2723,15 +2732,6 @@ static int __init xlblk_init(void)
                 xen_blkif_max_queues = nr_cpus;
         }
 
-        if (!xen_has_pv_disk_devices())
-                return -ENODEV;
-
-        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
-                printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
-                       XENVBD_MAJOR, DEV_NAME);
-                return -ENODEV;
-        }
-
         INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work);
 
         ret = xenbus_register_frontend(&blkfront_driver);

From d77ff24e7fa2258877fa0b87efa06b9a58a37aab Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Mon, 13 Aug 2018 16:01:13 +0200
Subject: [PATCH 04/15] xen/blkback: move persistent grants flags to bool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct persistent_gnt flags member is meant to be a bitfield of
different flags.
There is only PERSISTENT_GNT_ACTIVE flag left, so convert it to a bool
named "active".

Signed-off-by: Juergen Gross
Reviewed-by: Roger Pau Monné
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkback/blkback.c | 13 ++++++-------
 drivers/block/xen-blkback/common.h  |  7 +------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 9eae7b243f68..fd1e19f1a49f 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -255,8 +255,7 @@ static int add_persistent_gnt(struct xen_blkif_ring *ring,
                 }
         }
 
-        bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
-        set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+        persistent_gnt->active = true;
         /* Add new node and rebalance tree. */
         rb_link_node(&(persistent_gnt->node), parent, new);
         rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
@@ -280,11 +279,11 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
                 else if (gref > data->gnt)
                         node = node->rb_right;
                 else {
-                        if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
+                        if (data->active) {
                                 pr_alert_ratelimited("requesting a grant already in use\n");
                                 return NULL;
                         }
-                        set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
+                        data->active = true;
                         atomic_inc(&ring->persistent_gnt_in_use);
                         return data;
                 }
@@ -295,10 +294,10 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
 static void put_persistent_gnt(struct xen_blkif_ring *ring,
                                struct persistent_gnt *persistent_gnt)
 {
-        if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+        if (!persistent_gnt->active)
                 pr_alert_ratelimited("freeing a grant already unused\n");
         persistent_gnt->last_used = jiffies;
-        clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+        persistent_gnt->active = false;
         atomic_dec(&ring->persistent_gnt_in_use);
 }
 
@@ -429,7 +428,7 @@ purge_list:
                 BUG_ON(persistent_gnt->handle ==
                         BLKBACK_INVALID_HANDLE);
 
-                if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+                if (persistent_gnt->active)
                         continue;
                 if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
                         continue;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 7bff72db3b7e..2339b8d39c5e 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -233,11 +233,6 @@ struct xen_vbd {
 
 struct backend_info;
 
-/* Number of available flags */
-#define PERSISTENT_GNT_FLAGS_SIZE       1
-/* This persistent grant is currently in use */
-#define PERSISTENT_GNT_ACTIVE   0
-
 /* Number of requests that we can fit in a ring */
 #define XEN_BLKIF_REQS_PER_PAGE 32
 
@@ -246,7 +241,7 @@ struct persistent_gnt {
         struct page *page;
         grant_ref_t gnt;
         grant_handle_t handle;
         unsigned long last_used;
-        DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
+        bool active;
         struct rb_node node;
         struct list_head remove_node;
 };

From 6f2f39ad1a54978394851c05e327419ebeb7227e Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Mon, 13 Aug 2018 16:01:14 +0200
Subject: [PATCH 05/15] xen/blkback: remove unused pers_gnts_lock from struct xen_blkif_ring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pers_gnts_lock isn't being used anywhere. Remove it.
Signed-off-by: Juergen Gross
Reviewed-by: Roger Pau Monné
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkback/common.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 2339b8d39c5e..1d3002d773f7 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -269,7 +269,6 @@ struct xen_blkif_ring {
         wait_queue_head_t       pending_free_wq;
 
         /* Tree to store persistent grants. */
-        spinlock_t              pers_gnts_lock;
         struct rb_root          persistent_gnts;
         unsigned int            persistent_gnt_c;
         atomic_t                persistent_gnt_in_use;

From 061a5427530633de93ace4ef001b99961984af62 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 26 Aug 2018 10:09:06 -0600
Subject: [PATCH 06/15] blk-wbt: abstract out end IO completion handler

Prep patch for calling the handler from a different context, no
functional changes in this patch.

Tested-by: Agarwal, Anchal
Signed-off-by: Jens Axboe
---
 block/blk-wbt.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 84507d3e9a98..4575b4650370 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -123,16 +123,11 @@ static void rwb_wake_all(struct rq_wb *rwb)
         }
 }
 
-static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
+static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
+                         enum wbt_flags wb_acct)
 {
-        struct rq_wb *rwb = RQWB(rqos);
-        struct rq_wait *rqw;
         int inflight, limit;
 
-        if (!(wb_acct & WBT_TRACKED))
-                return;
-
-        rqw = get_rq_wait(rwb, wb_acct);
         inflight = atomic_dec_return(&rqw->inflight);
 
         /*
@@ -170,6 +165,18 @@ static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
         }
 }
 
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
+{
+        struct rq_wb *rwb = RQWB(rqos);
+        struct rq_wait *rqw;
+
+        if (!(wb_acct & WBT_TRACKED))
+                return;
+
+        rqw = get_rq_wait(rwb, wb_acct);
+        wbt_rqw_done(rwb, rqw, wb_acct);
+}
+
 /*
  * Called on completion of a request. Note that it's also called when
  * a request is merged, when the request gets freed.

From 38cfb5a45ee013bfab5d1ae4c4738815e744b440 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 26 Aug 2018 10:10:05 -0600
Subject: [PATCH 07/15] blk-wbt: improve waking of tasks

We have two potential issues:

1) After commit 2887e41b910b, we only wake one process at the time when
   we finish an IO. We really want to wake up as many tasks as can
   queue IO. Before this commit, we woke up everyone, which could cause
   a thundering herd issue.

2) A task can potentially consume two wakeups, causing us to (in
   practice) miss a wakeup.

Fix both by providing our own wakeup function, which stops
__wake_up_common() from waking up more tasks if we fail to get a
queueing token. With the strict ordering we have on the wait list, this
wakes the right tasks and the right amount of tasks.

Based on a patch from Jianchao Wang.
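
The key mechanics live in the wake callback attached to the wait entry:
it only admits a waiter that can actually take a queueing token, and its
return value tells __wake_up_common() whether to keep scanning the wait
list. Sketched below (simplified from the hunk further down):

        static int wbt_wake_function(struct wait_queue_entry *curr,
                                     unsigned int mode, int wake_flags, void *key)
        {
                struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, wq);

                if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
                        return -1;      /* no token left: stop the wake-up loop here */

                data->got_token = true; /* this waiter now owns an inflight slot */
                list_del_init(&curr->entry);
                wake_up_process(data->task);
                return 1;
        }

A negative return from a wait-queue wake function aborts the traversal
in __wake_up_common(), which is what prevents waking tasks that would
fail to queue anyway.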
Tested-by: Agarwal, Anchal
Signed-off-by: Jens Axboe
---
 block/blk-wbt.c | 65 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 8 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4575b4650370..bfb0d21d19ce 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -161,7 +161,7 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
                 int diff = limit - inflight;
 
                 if (!inflight || diff >= rwb->wb_background / 2)
-                        wake_up(&rqw->wait);
+                        wake_up_all(&rqw->wait);
         }
 }
 
@@ -488,6 +488,34 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
         return limit;
 }
 
+struct wbt_wait_data {
+        struct wait_queue_entry wq;
+        struct task_struct *task;
+        struct rq_wb *rwb;
+        struct rq_wait *rqw;
+        unsigned long rw;
+        bool got_token;
+};
+
+static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+                             int wake_flags, void *key)
+{
+        struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data,
+                                                  wq);
+
+        /*
+         * If we fail to get a budget, return -1 to interrupt the wake up
+         * loop in __wake_up_common.
+         */
+        if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
+                return -1;
+
+        data->got_token = true;
+        list_del_init(&curr->entry);
+        wake_up_process(data->task);
+        return 1;
+}
+
 /*
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
@@ -498,31 +526,52 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
         __acquires(lock)
 {
         struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
-        DECLARE_WAITQUEUE(wait, current);
+        struct wbt_wait_data data = {
+                .wq = {
+                        .func   = wbt_wake_function,
+                        .entry  = LIST_HEAD_INIT(data.wq.entry),
+                },
+                .task = current,
+                .rwb = rwb,
+                .rqw = rqw,
+                .rw = rw,
+        };
         bool has_sleeper;
 
         has_sleeper = wq_has_sleeper(&rqw->wait);
         if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
                 return;
 
-        add_wait_queue_exclusive(&rqw->wait, &wait);
+        prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
         do {
-                set_current_state(TASK_UNINTERRUPTIBLE);
-
-                if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+                if (data.got_token)
                         break;
 
+                if (!has_sleeper &&
+                    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
+                        finish_wait(&rqw->wait, &data.wq);
+
+                        /*
+                         * We raced with wbt_wake_function() getting a token,
+                         * which means we now have two. Put our local token
+                         * and wake anyone else potentially waiting for one.
+                         */
+                        if (data.got_token)
+                                wbt_rqw_done(rwb, rqw, wb_acct);
+                        break;
+                }
+
                 if (lock) {
                         spin_unlock_irq(lock);
                         io_schedule();
                         spin_lock_irq(lock);
                 } else
                         io_schedule();
+
                 has_sleeper = false;
         } while (1);
 
-        __set_current_state(TASK_RUNNING);
-        remove_wait_queue(&rqw->wait, &wait);
+        finish_wait(&rqw->wait, &data.wq);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)

From b0a84beb2e35536839ea289182684528f379b860 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 27 Aug 2018 13:32:12 -0600
Subject: [PATCH 08/15] blk-wbt: remove dead code

We already note and mark discard and swap IO from bio_to_wbt_flags().
Signed-off-by: Jens Axboe
---
 block/blk-wbt.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index bfb0d21d19ce..8e20a0677dcf 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -636,11 +636,6 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
                 return;
         }
 
-        if (current_is_kswapd())
-                flags |= WBT_KSWAPD;
-        if (bio_op(bio) == REQ_OP_DISCARD)
-                flags |= WBT_DISCARD;
-
         __wbt_wait(rwb, flags, bio->bi_opf, lock);
 
         if (!blk_stat_is_active(rwb->cb))

From 46cb52ad414ac829680d0bb8cc7090ac2b577ca7 Mon Sep 17 00:00:00 2001
From: Linus Walleij
Date: Sun, 15 Jul 2018 22:09:29 +0200
Subject: [PATCH 09/15] ata: ftide010: Add a quirk for SQ201

The DMA is broken on this specific device for some unknown reason
(probably badly designed or plain broken interface electronics) and
will only work with PIO. Other users of the same hardware do not have
this problem.

Add a specific quirk so that this Gemini device gets DMA turned off.
Also fix up some code around passing the port information around in
probe while we're at it.

Signed-off-by: Linus Walleij
Signed-off-by: Jens Axboe
---
 drivers/ata/pata_ftide010.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/drivers/ata/pata_ftide010.c b/drivers/ata/pata_ftide010.c
index 5d4b72e21161..569a4a662dcd 100644
--- a/drivers/ata/pata_ftide010.c
+++ b/drivers/ata/pata_ftide010.c
@@ -256,14 +256,12 @@ static struct ata_port_operations pata_ftide010_port_ops = {
         .qc_issue       = ftide010_qc_issue,
 };
 
-static struct ata_port_info ftide010_port_info[] = {
-        {
-                .flags          = ATA_FLAG_SLAVE_POSS,
-                .mwdma_mask     = ATA_MWDMA2,
-                .udma_mask      = ATA_UDMA6,
-                .pio_mask       = ATA_PIO4,
-                .port_ops       = &pata_ftide010_port_ops,
-        },
+static struct ata_port_info ftide010_port_info = {
+        .flags          = ATA_FLAG_SLAVE_POSS,
+        .mwdma_mask     = ATA_MWDMA2,
+        .udma_mask      = ATA_UDMA6,
+        .pio_mask       = ATA_PIO4,
+        .port_ops       = &pata_ftide010_port_ops,
 };
 
 #if IS_ENABLED(CONFIG_SATA_GEMINI)
@@ -349,6 +347,7 @@ static int pata_ftide010_gemini_cable_detect(struct ata_port *ap)
 }
 
 static int pata_ftide010_gemini_init(struct ftide010 *ftide,
+                                     struct ata_port_info *pi,
                                      bool is_ata1)
 {
         struct device *dev = ftide->dev;
@@ -373,7 +372,13 @@ static int pata_ftide010_gemini_init(struct ftide010 *ftide,
 
         /* Flag port as SATA-capable */
         if (gemini_sata_bridge_enabled(sg, is_ata1))
-                ftide010_port_info[0].flags |= ATA_FLAG_SATA;
+                pi->flags |= ATA_FLAG_SATA;
+
+        /* This device has broken DMA, only PIO works */
+        if (of_machine_is_compatible("itian,sq201")) {
+                pi->mwdma_mask = 0;
+                pi->udma_mask = 0;
+        }
 
         /*
          * We assume that a simple 40-wire cable is used in the PATA mode.
@@ -435,6 +440,7 @@ static int pata_ftide010_gemini_init(struct ftide010 *ftide,
 }
 #else
 static int pata_ftide010_gemini_init(struct ftide010 *ftide,
+                                     struct ata_port_info *pi,
                                      bool is_ata1)
 {
         return -ENOTSUPP;
@@ -446,7 +452,7 @@ static int pata_ftide010_probe(struct platform_device *pdev)
 {
         struct device *dev = &pdev->dev;
         struct device_node *np = dev->of_node;
-        const struct ata_port_info pi = ftide010_port_info[0];
+        struct ata_port_info pi = ftide010_port_info;
         const struct ata_port_info *ppi[] = { &pi, NULL };
         struct ftide010 *ftide;
         struct resource *res;
@@ -490,6 +496,7 @@ static int pata_ftide010_probe(struct platform_device *pdev)
          * are ATA0. This will also set up the cable types.
          */
         ret = pata_ftide010_gemini_init(ftide,
+                                        &pi,
                                         (res->start == 0x63400000));
         if (ret)
                 goto err_dis_clk;

From 62d2a1940709198a522a43ff8be8b8f6b3654dec Mon Sep 17 00:00:00 2001
From: Chengguang Xu
Date: Tue, 28 Aug 2018 07:31:11 +0800
Subject: [PATCH 10/15] block: remove unnecessary condition check

kmem_cache_destroy() can handle NULL pointer correctly, so there is no
need to check e->icq_cache before calling kmem_cache_destroy().

Signed-off-by: Chengguang Xu
Signed-off-by: Jens Axboe
---
 block/elevator.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 5ea6e7d600e4..6a06b5d040e5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -895,8 +895,7 @@ int elv_register(struct elevator_type *e)
         spin_lock(&elv_list_lock);
         if (elevator_find(e->elevator_name, e->uses_mq)) {
                 spin_unlock(&elv_list_lock);
-                if (e->icq_cache)
-                        kmem_cache_destroy(e->icq_cache);
+                kmem_cache_destroy(e->icq_cache);
                 return -EBUSY;
         }
         list_add_tail(&e->list, &elv_list);

From db193954ed9e35701b6e489fa4cc97b08589341b Mon Sep 17 00:00:00 2001
From: John Pittman
Date: Mon, 27 Aug 2018 14:33:05 -0400
Subject: [PATCH 11/15] block: bsg: move atomic_t ref_count variable to refcount API

Currently, variable ref_count within the bsg_device struct is of
type atomic_t. For variables being used as reference counters,
the refcount API should be used instead of atomic. The newer
refcount API works to prevent counter overflows and use-after-free
bugs. So, move this variable from the atomic API to refcount,
potentially avoiding the issues mentioned.

Signed-off-by: John Pittman
Signed-off-by: Jens Axboe
---
 block/bsg.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index db588add6ba6..9a442c23a715 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -37,7 +37,7 @@ struct bsg_device {
         struct request_queue *queue;
         spinlock_t lock;
         struct hlist_node dev_list;
-        atomic_t ref_count;
+        refcount_t ref_count;
         char name[20];
         int max_queue;
 };
@@ -252,7 +252,7 @@ static int bsg_put_device(struct bsg_device *bd)
 
         mutex_lock(&bsg_mutex);
 
-        if (!atomic_dec_and_test(&bd->ref_count)) {
+        if (!refcount_dec_and_test(&bd->ref_count)) {
                 mutex_unlock(&bsg_mutex);
                 return 0;
         }
@@ -290,7 +290,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 
         bd->queue = rq;
 
-        atomic_set(&bd->ref_count, 1);
+        refcount_set(&bd->ref_count, 1);
         hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
 
         strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
@@ -308,7 +308,7 @@ static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q)
 
         hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) {
                 if (bd->queue == q) {
-                        atomic_inc(&bd->ref_count);
+                        refcount_inc(&bd->ref_count);
                         goto found;
                 }
         }

From f1ed3df20d2d223e0852cc4ac1f19bba869a7e3c Mon Sep 17 00:00:00 2001
From: Michal Wnukowski
Date: Wed, 15 Aug 2018 15:51:57 -0700
Subject: [PATCH 12/15] nvme-pci: add a memory barrier to nvme_dbbuf_update_and_check_event

In many architectures loads may be reordered with older stores to
different locations. In the nvme driver the following two operations
could be reordered:

 - Write shadow doorbell (dbbuf_db) into memory.
 - Read EventIdx (dbbuf_ei) from memory.

This can result in a potential race condition between driver and VM host
processing requests (if given virtual NVMe controller has support for
shadow doorbell).
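
Schematically, the per-command update in nvme_dbbuf_update_and_check_event()
performs the following sequence (illustration only, not the patch itself;
the need_mmio name is made up for the sketch):

        old_value = *dbbuf_db;
        *dbbuf_db = value;              /* store: publish the new shadow doorbell */
        /* on a weakly ordered CPU the load below may be satisfied before
         * the store above becomes visible to the other side */
        need_mmio = nvme_dbbuf_need_event(*dbbuf_ei, value, old_value); /* load EventIdx */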
If that occurs, then the NVMe controller may decide to wait for MMIO
doorbell from guest operating system, and guest driver may decide not
to issue MMIO doorbell on any of subsequent commands.

This issue is a purely timing-dependent one, so there is no easy way to
reproduce it. Currently the easiest known approach is to run "Oracle IO
Numbers" (orion) that is shipped with Oracle DB:

orion -run advanced -num_large 0 -size_small 8 -type rand -simulate \
        concat -write 40 -duration 120 -matrix row -testname nvme_test

Where nvme_test is a .lun file that contains a list of NVMe block
devices to run test against. Limiting number of vCPUs assigned to given
VM instance seems to increase chances for this bug to occur. On test
environment with VM that got 4 NVMe drives and 1 vCPU assigned the
virtual NVMe controller hang could be observed within 10-20 minutes.
That corresponds to about 400-500k IO operations processed (or about
100GB of IO read/writes).

Orion tool was used as a validation and set to run in a loop for 36
hours (equivalent of pushing 550M IO operations). No issues were
observed. That suggests that the patch fixes the issue.

Fixes: f9f38e33389c ("nvme: improve performance for virtual NVMe devices")
Signed-off-by: Michal Wnukowski
Reviewed-by: Keith Busch
Reviewed-by: Sagi Grimberg
[hch: updated changelog and comment a bit]
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 1b9951d2067e..d668682f91df 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -316,6 +316,14 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
                 old_value = *dbbuf_db;
                 *dbbuf_db = value;
 
+                /*
+                 * Ensure that the doorbell is updated before reading the event
+                 * index from memory. The controller needs to provide similar
+                 * ordering to ensure the event index is updated before reading
+                 * the doorbell.
+                 */
+                mb();
+
                 if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
                         return false;
         }

From afd299ca996929f4f98ac20da0044c0cdc124879 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Thu, 9 Aug 2018 16:00:14 -0700
Subject: [PATCH 13/15] nvme-fcloop: Fix dropped LS's to removed target port

When a targetport is removed from the config, fcloop will avoid calling
the LS done() routine thinking the targetport is gone. This leaves the
initiator reset/reconnect hanging as it waits for a status on the
Create_Association LS for the reconnect.

Change the filter in the LS callback path. If tport is null (set when
failed validation before "sending to remote port"), be sure to call
done. This was the main bug. But, continue the logic that only calls
done if tport was set but there is no remoteport (e.g. case where
remoteport has been removed, thus host doesn't expect a completion).
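
Spelled out, the completion-side filter behaves as follows after this
change (schematic expansion of the one-line test in the hunk below):

        if (!tport)                     /* LS failed validation and never reached a
                                         * target: the host still waits, complete it */
                lsreq->done(lsreq, tls_req->status);
        else if (tport->remoteport)     /* normal case: remote port still present */
                lsreq->done(lsreq, tls_req->status);
        /* else: remoteport was removed, host no longer expects a completion */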
Signed-off-by: James Smart
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/fcloop.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 34712def81b1..5251689a1d9a 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -311,7 +311,7 @@ fcloop_tgt_lsrqst_done_work(struct work_struct *work)
         struct fcloop_tport *tport = tls_req->tport;
         struct nvmefc_ls_req *lsreq = tls_req->lsreq;
 
-        if (tport->remoteport)
+        if (!tport || tport->remoteport)
                 lsreq->done(lsreq, tls_req->status);
 }
 
@@ -329,6 +329,7 @@ fcloop_ls_req(struct nvme_fc_local_port *localport,
 
         if (!rport->targetport) {
                 tls_req->status = -ECONNREFUSED;
+                tls_req->tport = NULL;
                 schedule_work(&tls_req->work);
                 return ret;
         }

From 04db0e5ec58167364a80fd33ddb4f3b67434eb85 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Wed, 15 Aug 2018 18:48:25 -0700
Subject: [PATCH 14/15] nvmet: free workqueue object if module init fails

Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index ebf3e7a6c49e..b5ec96abd048 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1210,7 +1210,7 @@ static int __init nvmet_init(void)
 
         error = nvmet_init_discovery();
         if (error)
-                goto out;
+                goto out_free_work_queue;
 
         error = nvmet_init_configfs();
         if (error)
@@ -1219,6 +1219,8 @@ static int __init nvmet_init(void)
 
 out_exit_discovery:
         nvmet_exit_discovery();
+out_free_work_queue:
+        destroy_workqueue(buffered_io_wq);
 out:
         return error;
 }

From 8f3fafc9c2f0ece10832c25f7ffcb07c97a32ad4 Mon Sep 17 00:00:00 2001
From: Scott Bauer
Date: Thu, 26 Apr 2018 11:51:08 -0600
Subject: [PATCH 15/15] cdrom: Fix info leak/OOB read in cdrom_ioctl_drive_status

Like d88b6d04: "cdrom: information leak in cdrom_ioctl_media_changed()"

There is another cast from unsigned long to int which causes
a bounds check to fail with specially crafted input. The value
is then used as an index in the slot array in cdrom_slot_status().

Signed-off-by: Scott Bauer
Signed-off-by: Scott Bauer
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe
---
 drivers/cdrom/cdrom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 113fc6edb2b0..a5d5a96479bf 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2546,7 +2546,7 @@ static int cdrom_ioctl_drive_status(struct cdrom_device_info *cdi,
         if (!CDROM_CAN(CDC_SELECT_DISC) ||
             (arg == CDSL_CURRENT || arg == CDSL_NONE))
                 return cdi->ops->drive_status(cdi, CDSL_CURRENT);
-        if (((int)arg >= cdi->capacity))
+        if (arg >= cdi->capacity)
                 return -EINVAL;
         return cdrom_slot_status(cdi, arg);
 }
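
For reference, the effect of dropping the int cast can be seen with a
concrete value (illustrative only; assumes a 64-bit build where unsigned
long is wider than int, with names mirroring the function above):

        unsigned long arg = 0x100000000UL;      /* crafted ioctl argument */
        int capacity = 4;                       /* e.g. cdi->capacity */

        /* old check: (int)arg truncates to 0, so 0 >= 4 is false and the
         * out-of-range value reaches cdrom_slot_status() as an index */
        /* new check: capacity is promoted to unsigned long, 0x100000000 >= 4
         * is true, so the ioctl returns -EINVAL before any array access */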