From d47c8ad261f787af22a220ffcc2d07afba809223 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 5 Oct 2017 16:23:16 +1100 Subject: [PATCH 01/33] md: fix deadlock error in recent patch. A recent patch aimed to cause md_write_start() to fail (rather than block) when the mddev was suspending, so as to avoid deadlocks. Unfortunately the test in wait_event() was wrong, and it didn't change behaviour at all. We wait_event() must wait until the metadata is written OR the array is suspending. Fixes: cc27b0c78c79 ("md: fix deadlock between mddev_suspend() and md_write_start()") Cc: stable@vger.kernel.org Reported-by: Xiao Ni Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0ff1bbf6c90e..8b2eb0f4122f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8039,7 +8039,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + mddev->suspended); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; From d1d90147c9680aaec4a5757932c2103c42c9c23b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Oct 2017 10:32:48 +0800 Subject: [PATCH 02/33] md: always set THREAD_WAKEUP and wake up wqueue if thread existed Since commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), the wait_queue is only got invoked if THREAD_WAKEUP is not set previously. With above change, I can see process_metadata_update could always hang on the wait queue, because mddev->thread could stay on 'D' status and the THREAD_WAKEUP flag is not cleared since there are lots of place to wake up mddev->thread. Then deadlock happened as follows: linux175:~ # ps aux|grep md|grep D root 20117 0.0 0.0 0 0 ? D 03:45 0:00 [md0_raid1] root 20125 0.0 0.0 0 0 ? D 03:45 0:00 [md0_cluster_rec] linux175:~ # cat /proc/20117/stack [] dlm_lock_sync+0x94/0xd0 [md_cluster] [] lock_token+0x34/0xd0 [md_cluster] [] metadata_update_start+0x64/0x110 [md_cluster] [] md_update_sb.part.58+0x9b/0x860 [md_mod] [] md_update_sb+0x15/0x30 [md_mod] [] md_check_recovery+0x266/0x490 [md_mod] [] raid1d+0x42/0x810 [raid1] [] md_thread+0x122/0x150 [md_mod] [] kthread+0x101/0x140 linux175:~ # cat /proc/20125/stack [] recv_daemon+0x3f9/0x5c0 [md_cluster] [] md_thread+0x122/0x150 [md_mod] [] kthread+0x101/0x140 So let's revert the part of code in the commit to resovle the problem since we can't get lots of benefits of previous change. Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8b2eb0f4122f..707471e3cb01 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7468,8 +7468,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) - wake_up(&thread->wqueue); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); From 938b533d479e7428b7fa1b8179283646d2e2c53d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 16 Oct 2017 19:03:44 -0700 Subject: [PATCH 03/33] md/bitmap: revert a patch This reverts commit 8031c3ddc70a. That patches doesn't work well if PAGE_SIZE > 4k. We will fix the original problem with a different approach. Fix: 8031c3ddc70a(md/bitmap: copy correct data for bitmap super) Reported-by: Joshua Kinard Cc: stable@vger.kernel.org (4.10+) Suggested-by: Neil Brown Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4ab..cae57b5be817 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -625,7 +625,7 @@ re_read: err = read_sb_page(bitmap->mddev, offset, sb_page, - 0, PAGE_SIZE); + 0, sizeof(bitmap_super_t)); } if (err) return err; @@ -2123,7 +2123,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), - PAGE_SIZE); + sizeof(bitmap_super_t)); bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; From 385f4d7f946b08f36f68b0a28e95a319925b6b62 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 29 Sep 2017 09:16:43 +0800 Subject: [PATCH 04/33] md-cluster: fix wrong condition check in raid1_write_request The check used here is to avoid conflict between write and resync, however we used the wrong logic, it should be the inverse of the checking inside "if". Fixes: 589a1c4 ("Suspend writes in RAID1 if within range") Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f3f3e40dc9d8..35264ad0ec70 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1325,12 +1325,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi || - (mddev_is_clustered(mddev) && + if ((bio_end_sector(bio) <= mddev->suspend_lo || + bio->bi_iter.bi_sector >= mddev->suspend_hi) && + (!mddev_is_clustered(mddev) || !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) break; sigfillset(&full); sigprocmask(SIG_BLOCK, &full, &old); From 611426e2737235cf05e1b8f27d2502b96a5e05d9 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Fri, 29 Sep 2017 22:54:18 +0200 Subject: [PATCH 05/33] raid5-ppl: don't resync after rebuild The check for degraded array is unnecessary and causes a resync to be performed after ppl recovery and rebuild when restarting an array during rebuilding after unclean shutdown. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index cd026c88f7ef..76d6245427b8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1296,8 +1296,7 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && - mddev->recovery_cp == 0 && !mddev->degraded && + } else if (!mddev->pers && mddev->recovery_cp == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* From 07719ff767dcd8cc42050f185d332052f3816546 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Fri, 29 Sep 2017 22:54:19 +0200 Subject: [PATCH 06/33] raid5-ppl: check recovery_offset when performing ppl recovery If starting an array that is undergoing rebuild, make ppl recovery honor the recovery_offset of a member disk and don't read data that is not yet in-sync. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 76d6245427b8..628c0bf7b9fd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)sector); rdev = conf->disks[dd_idx].rdev; - if (!rdev) { + if (!rdev || (!test_bit(In_sync, &rdev->flags) && + sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", __func__, indent, "", dd_idx); update_parity = false; From 7a57157aeb157cd02ccdcff237bbf63440035b07 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 10:51:17 +0100 Subject: [PATCH 07/33] md-cluster: make function cluster_check_sync_size static The function cluster_check_sync_size is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'cluster_check_sync_size' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 03082e17c65c..bf41492a2cb0 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev) /* * return 0 if all the bitmaps have the same sync_size */ -int cluster_check_sync_size(struct mddev *mddev) +static int cluster_check_sync_size(struct mddev *mddev) { int i, rv; bitmap_super_t *sb; From 584ed9fa9532f8b9d5955628ff87ee3b2ab9f5a9 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 5 Oct 2017 11:28:47 -0700 Subject: [PATCH 08/33] md: raid10: remove VLAIS The raid10 driver can't be built with clang since it uses a variable length array in a structure (VLAIS): drivers/md/raid10.c:4583:17: error: fields must have a constant size: 'variable length array in structure' extension will never be supported Allocate the r10bio struct with kmalloc instead of using the VLAIS construct. Shaohua: set the MD_RECOVERY_INTR bit Neil Brown: use GFP_NOIO Signed-off-by: Matthias Kaehlcke Reviewed-by: Guenter Roeck Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 374df5796649..950fbefbedbb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4578,15 +4578,18 @@ static int handle_reshape_read_error(struct mddev *mddev, /* Use sync reads to get the blocks from somewhere else */ int sectors = r10_bio->sectors; struct r10conf *conf = mddev->private; - struct { - struct r10bio r10_bio; - struct r10dev devs[conf->copies]; - } on_stack; - struct r10bio *r10b = &on_stack.r10_bio; + struct r10bio *r10b; int slot = 0; int idx = 0; struct page **pages; + r10b = kmalloc(sizeof(*r10b) + + sizeof(struct r10dev) * conf->copies, GFP_NOIO); + if (!r10b) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return -ENOMEM; + } + /* reshape IOs share pages from .devs[0].bio */ pages = get_resync_pages(r10_bio->devs[0].bio)->pages; @@ -4635,11 +4638,13 @@ static int handle_reshape_read_error(struct mddev *mddev, /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + kfree(r10b); return -EIO; } sectors -= s; idx++; } + kfree(r10b); return 0; } From 935fe0983e09f4f7331ebf5ea4ae2124f6e9f9e8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 10 Oct 2017 17:02:41 -0400 Subject: [PATCH 09/33] md: rename some drivers/md/ files to have an "md-" prefix Motivated by the desire to illiminate the imprecise nature of DM-specific patches being unnecessarily sent to both the MD maintainer and mailing-list. Which is born out of the fact that DM files also reside in drivers/md/ Now all MD-specific files in drivers/md/ start with either "raid" or "md-" and the MAINTAINERS file has been updated accordingly. Shaohua: don't change module name Signed-off-by: Mike Snitzer Signed-off-by: Shaohua Li --- MAINTAINERS | 7 ++++++- drivers/md/Makefile | 5 ++++- drivers/md/dm-raid.c | 2 +- drivers/md/{bitmap.c => md-bitmap.c} | 2 +- drivers/md/{bitmap.h => md-bitmap.h} | 0 drivers/md/md-cluster.c | 2 +- drivers/md/{faulty.c => md-faulty.c} | 0 drivers/md/{linear.c => md-linear.c} | 2 +- drivers/md/{linear.h => md-linear.h} | 0 drivers/md/{multipath.c => md-multipath.c} | 2 +- drivers/md/{multipath.h => md-multipath.h} | 0 drivers/md/md.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5-cache.c | 2 +- drivers/md/raid5.c | 2 +- 16 files changed, 20 insertions(+), 12 deletions(-) rename drivers/md/{bitmap.c => md-bitmap.c} (99%) rename drivers/md/{bitmap.h => md-bitmap.h} (100%) rename drivers/md/{faulty.c => md-faulty.c} (100%) rename drivers/md/{linear.c => md-linear.c} (99%) rename drivers/md/{linear.h => md-linear.h} (100%) rename drivers/md/{multipath.c => md-multipath.c} (99%) rename drivers/md/{multipath.h => md-multipath.h} (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 65b0c88d5ee0..7649877692b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4091,6 +4091,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git T: quilt http://people.redhat.com/agk/patches/linux/editing/ S: Maintained F: Documentation/device-mapper/ +F: drivers/md/Makefile +F: drivers/md/Kconfig F: drivers/md/dm* F: drivers/md/persistent-data/ F: include/linux/device-mapper.h @@ -12446,7 +12448,10 @@ M: Shaohua Li L: linux-raid@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git S: Supported -F: drivers/md/ +F: drivers/md/Makefile +F: drivers/md/Kconfig +F: drivers/md/md* +F: drivers/md/raid* F: include/linux/raid/ F: include/uapi/linux/raid/ diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 786ec9e86d65..693602ffdd38 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -18,9 +18,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-smq-y += dm-cache-policy-smq.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o -md-mod-y += md.o bitmap.o +md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o +linear-y += md-linear.o +multipath-y += md-multipath.o +faulty-y += md-faulty.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1ac58c5651b7..252770696a05 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -12,7 +12,7 @@ #include "raid1.h" #include "raid5.h" #include "raid10.h" -#include "bitmap.h" +#include "md-bitmap.h" #include diff --git a/drivers/md/bitmap.c b/drivers/md/md-bitmap.c similarity index 99% rename from drivers/md/bitmap.c rename to drivers/md/md-bitmap.c index cae57b5be817..b843b53b0f65 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,7 +29,7 @@ #include #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" static inline char *bmname(struct bitmap *bitmap) { diff --git a/drivers/md/bitmap.h b/drivers/md/md-bitmap.h similarity index 100% rename from drivers/md/bitmap.h rename to drivers/md/md-bitmap.h diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index bf41492a2cb0..bc81ecc24c96 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -15,7 +15,7 @@ #include #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #define LVB_SIZE 64 diff --git a/drivers/md/faulty.c b/drivers/md/md-faulty.c similarity index 100% rename from drivers/md/faulty.c rename to drivers/md/md-faulty.c diff --git a/drivers/md/linear.c b/drivers/md/md-linear.c similarity index 99% rename from drivers/md/linear.c rename to drivers/md/md-linear.c index c464fb48039a..773fc70dced7 100644 --- a/drivers/md/linear.c +++ b/drivers/md/md-linear.c @@ -23,7 +23,7 @@ #include #include #include "md.h" -#include "linear.h" +#include "md-linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/md-linear.h similarity index 100% rename from drivers/md/linear.h rename to drivers/md/md-linear.h diff --git a/drivers/md/multipath.c b/drivers/md/md-multipath.c similarity index 99% rename from drivers/md/multipath.c rename to drivers/md/md-multipath.c index b68e0666b9b0..5c70176fa24d 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/md-multipath.c @@ -25,7 +25,7 @@ #include #include #include "md.h" -#include "multipath.h" +#include "md-multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/md-multipath.h similarity index 100% rename from drivers/md/multipath.h rename to drivers/md/md-multipath.h diff --git a/drivers/md/md.c b/drivers/md/md.c index 707471e3cb01..97afb28c6f51 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,7 +69,7 @@ #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #ifndef MODULE diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 35264ad0ec70..efdabd3040e7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -43,7 +43,7 @@ #include "md.h" #include "raid1.h" -#include "bitmap.h" +#include "md-bitmap.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 950fbefbedbb..862cbd162e1c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -29,7 +29,7 @@ #include "md.h" #include "raid10.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0b7406ac8ce1..2b450eee21fa 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -23,7 +23,7 @@ #include #include "md.h" #include "raid5.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 928e24a07133..10c0d87074f0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -63,7 +63,7 @@ #include "md.h" #include "raid5.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) From a0e764c54382be8da96f83bcecc9cf26de3846dc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:46:54 +0100 Subject: [PATCH 10/33] md: raid10: remove a couple of redundant variables and initializations Variables dev and bio_last_sector are assigned values that are never read and hence these are redundant variables and can be removed. Also remove the duplicated initialization of sectors, the latter assignment is identical to the first and can be removed. Cleans up 3 clang build warnings: Value stored to 'dev' is never read Value stored to 'bio_last_sector' is never read Value stored to 'sectors' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 862cbd162e1c..b0de5b5ee689 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -383,12 +383,11 @@ static void raid10_end_read_request(struct bio *bio) { int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; - int slot, dev; + int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; - dev = r10_bio->devs[slot].devnum; rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: @@ -748,7 +747,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); - sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; best_dist = MaxSector; @@ -2575,7 +2573,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -2586,7 +2583,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; From 235b6003fb28f0dd8e7ed8fbdb088bb548291766 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: [PATCH 11/33] raid5: Set R5_Expanded on parity devices as well as data. When reshaping a fully degraded raid5/raid6 to a larger nubmer of devices, the new device(s) are not in-sync and so that can make the newly grown stripe appear to be "failed". To avoid this, we set the R5_Expanded flag to say "Even though this device is not fully in-sync, this block is safe so don't treat the device as failed for this stripe". This flag is set for data devices, not not for parity devices. Consequently, if you have a RAID6 with two devices that are partly recovered and a spare, and start a reshape to include the spare, then when the reshape gets past the point where the recovery was up to, it will think the stripes are failed and will get into an infinite loop, failing to make progress. So when contructing parity on an EXPAND_READY stripe, set R5_Expanded. Reported-by: Curt Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 10c0d87074f0..a21dbd22a2fb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1818,8 +1818,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { set_bit(R5_UPTODATE, &dev->flags); + if (test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) From 230b55fa8d64007339319539f8f8e68114d08529 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 14:24:09 +1100 Subject: [PATCH 12/33] md: forbid a RAID5 from having both a bitmap and a journal. Having both a bitmap and a journal is pointless. Attempting to do so can corrupt the bitmap if the journal replay happens before the bitmap is initialized. Rather than try to avoid this corruption, simply refuse to allow arrays with both a bitmap and a journal. So: - if raid5_run sees both are present, fail. - if adding a bitmap finds a journal is present, fail - if adding a journal finds a bitmap is present, fail. Cc: stable@vger.kernel.org (4.10+) Signed-off-by: NeilBrown Tested-by: Joshua Kinard Acked-by: Joshua Kinard Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 6 ++++++ drivers/md/md.c | 2 +- drivers/md/raid5.c | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b843b53b0f65..d1b3b60669ea 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1816,6 +1816,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) BUG_ON(file && mddev->bitmap_info.offset); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/raid:%s: array with journal cannot have bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM); diff --git a/drivers/md/md.c b/drivers/md/md.c index 97afb28c6f51..6f25e3f1a1cf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6362,7 +6362,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) break; } } - if (has_journal) { + if (has_journal || mddev->bitmap) { export_rdev(rdev); return -EBUSY; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a21dbd22a2fb..a8732955f130 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7159,6 +7159,13 @@ static int raid5_run(struct mddev *mddev) min_offset_diff = diff; } + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Difficulties arise if the stripe we would write to From 4d5324f760aacaefeb721b172aa14bf66045c332 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:17:16 +1100 Subject: [PATCH 13/33] md: always hold reconfig_mutex when calling mddev_suspend() Most often mddev_suspend() is called with reconfig_mutex held. Make this a requirement in preparation a subsequent patch. Also require reconfig_mutex to be held for mddev_resume(), partly for symmetry and partly to guarantee no races with incr/decr of mddev->suspend. Taking the mutex in r5c_disable_writeback_async() is a little tricky as this is called from a work queue via log->disable_writeback_work, and flush_work() is called on that while holding ->reconfig_mutex. If the work item hasn't run before flush_work() is called, the work function will not be able to get the mutex. So we use mddev_trylock() inside the wait_event() call, and have that abort when conf->log is set to NULL, which happens before flush_work() is called. We wait in mddev->sb_wait and ensure this is woken when any of the conditions change. This requires waking mddev->sb_wait in mddev_unlock(). This is only like to trigger extra wake_ups of threads that needn't be woken when metadata is being written, and that doesn't happen often enough that the cost would be noticeable. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/dm-raid.c | 10 ++++++++-- drivers/md/md.c | 3 +++ drivers/md/raid5-cache.c | 18 +++++++++++++----- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 252770696a05..8b1d93114f40 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3628,8 +3628,11 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + } rs->md.ro = 1; } @@ -3886,8 +3889,11 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(mddev); mddev_resume(mddev); + mddev_unlock(mddev); + } } static struct target_type raid_target = { diff --git a/drivers/md/md.c b/drivers/md/md.c index 6f25e3f1a1cf..9767bb33df56 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -344,6 +344,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); @@ -357,6 +358,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -663,6 +665,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2b450eee21fa..59af7cf35092 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } } static void r5l_submit_current_io(struct r5l_log *log) @@ -3165,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf) conf->log = NULL; synchronize_rcu(); + /* Ensure disable_writeback_work wakes up and exits */ + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); From 52a0d49de3d592a3118e13f35985e3d99eaf43df Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: [PATCH 14/33] md: don't call bitmap_create() while array is quiesced. bitmap_create() allocates memory with GFP_KERNEL and so can wait for IO. If called while the array is quiesced, it could wait indefinitely for write out to the array - deadlock. So call bitmap_create() before quiescing the array. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9767bb33df56..2cb49f639809 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6621,22 +6621,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev->pers->quiesce(mddev, 0); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6920,8 +6924,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); From b3143b9a38d5039bcd1f2d1c94039651bfba8043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: [PATCH 15/33] md: move suspend_hi/lo handling into core md code responding to ->suspend_lo and ->suspend_hi is similar to responding to ->suspended. It is best to wait in the common core code without incrementing ->active_io. This allows mddev_suspend()/mddev_resume() to work while requests are waiting for suspend_lo/hi to change. This is will be important after a subsequent patch which uses mddev_suspend() to synchronize updating for suspend_lo/hi. So move the code for testing suspend_lo/hi out of raid1.c and raid5.c, and place it in md.c Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 29 +++++++++++++++++++++++------ drivers/md/raid1.c | 14 +++++--------- drivers/md/raid5.c | 22 ---------------------- 3 files changed, 28 insertions(+), 37 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2cb49f639809..68de2a6ee29a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: rcu_read_lock(); - if (mddev->suspended) { + if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!is_suspended(mddev, bio)) break; rcu_read_unlock(); schedule(); @@ -4845,10 +4860,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_lo; mddev->suspend_lo = new; - if (new >= old) + if (new >= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -4888,10 +4904,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_hi; mddev->suspend_hi = new; - if (new <= old) + if (new <= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index efdabd3040e7..fb56ef79a1c3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1310,11 +1310,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ - if ((bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) || - (mddev_is_clustered(mddev) && + if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + bio->bi_iter.bi_sector, bio_end_sector(bio))) { /* * As the suspend_* range is controlled by userspace, we want @@ -1325,12 +1323,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if ((bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi) && - (!mddev_is_clustered(mddev) || - !md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev_is_clustered(mddev) || + !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + bio_end_sector(bio))) break; sigfillset(&full); sigprocmask(SIG_BLOCK, &full, &old); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8732955f130..354a969f50a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5685,28 +5685,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or From 9e1cc0a54556a6c63dc0cfb7cd7d60d43337bba6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: [PATCH 16/33] md: use mddev_suspend/resume instead of ->quiesce() mddev_suspend() is a more general interface than calling ->quiesce() and is so more extensible. A future patch will make use of this. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 68de2a6ee29a..5bd4f18763bd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4866,8 +4866,8 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -4910,8 +4910,8 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -6642,7 +6642,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); @@ -6652,11 +6652,11 @@ static int set_bitmap_file(struct mddev *mddev, int fd) bitmap_destroy(mddev); fd = -1; } - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else if (fd < 0) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } } if (fd < 0) { @@ -6942,7 +6942,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6950,7 +6950,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6973,9 +6973,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } From 35bfc52187f6df8779d0f1cebdb52b7f797baf4e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: [PATCH 17/33] md: allow metadata update while suspending. There are various deadlocks that can occur when a thread holds reconfig_mutex and calls ->quiesce(mddev, 1). As some write request block waiting for metadata to be updated (e.g. to record device failure), and as the md thread updates the metadata while the reconfig mutex is held, holding the mutex can stop write requests completing, and this prevents ->quiesce(mddev, 1) from completing. ->quiesce() is now usually called from mddev_suspend(), and it is always called with reconfig_mutex held. So at this time it is safe for the thread to update metadata without explicitly taking the lock. So add 2 new flags, one which says the unlocked updates is allowed, and one which ways it is happening. Then allow it while the quiesce completes, and then wait for it to finish. Reported-and-tested-by: Xiao Ni Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++++++++ drivers/md/md.h | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5bd4f18763bd..9155f00dca20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -364,8 +364,12 @@ void mddev_suspend(struct mddev *mddev) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); } @@ -8838,6 +8842,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index d8287d3cd1bf..03fc641e5da1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -237,6 +237,12 @@ enum mddev_flags { */ MD_HAS_PPL, /* The raid array has PPL feature set */ MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ + MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update + * the metadata without taking reconfig_mutex. + */ + MD_UPDATING_SB, /* md_check_recovery is updating the metadata + * without explicitly holding reconfig_mutex. + */ }; enum mddev_sb_flags { From b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:49:15 +1100 Subject: [PATCH 18/33] md: remove special meaning of ->quiesce(.., 2) The '2' argument means "wake up anything that is waiting". This is an inelegant part of the design and was added to help support management of suspend_lo/suspend_hi setting. Now that suspend_lo/hi is managed in mddev_suspend/resume, that need is gone. These is still a couple of places where we call 'quiesce' with an argument of '2', but they can safely be changed to call ->quiesce(.., 1); ->quiesce(.., 0) which achieve the same result at the small cost of pausing IO briefly. This removes a small "optimization" from suspend_{hi,lo}_store, but it isn't clear that optimization served a useful purpose. The code now is a lot clearer. Suggested-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 6 +++--- drivers/md/md.c | 34 ++++++++++------------------------ drivers/md/md.h | 9 ++++----- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 13 +++---------- drivers/md/raid10.c | 10 +++------- drivers/md/raid5-cache.c | 12 ++++++------ drivers/md/raid5-log.h | 2 +- drivers/md/raid5.c | 18 ++++++------------ 9 files changed, 37 insertions(+), 69 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index bc81ecc24c96..d0fd1bd8575c 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } @@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev, s->lo = lo; s->hi = hi; mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9155f00dca20..d441b1d9846c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4846,7 +4846,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4862,17 +4862,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4890,7 +4883,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4903,20 +4896,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 03fc641e5da1..998b4ce1498f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -544,12 +544,11 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour */ - void (*quiesce) (struct mddev *mddev, int state); + void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 5a00fc118470..5ecba9eef441 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); } -static void raid0_quiesce(struct mddev *mddev, int state) +static void raid0_quiesce(struct mddev *mddev, int quiesce) { } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fb56ef79a1c3..9428dfa7e9a0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3273,21 +3273,14 @@ static int raid1_reshape(struct mddev *mddev) return 0; } -static void raid1_quiesce(struct mddev *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int quiesce) { struct r1conf *conf = mddev->private; - switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: + if (quiesce) freeze_array(conf, 0); - break; - case 0: + else unfreeze_array(conf); - break; - } } static void *raid1_takeover(struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b0de5b5ee689..615f677ceb1a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3828,18 +3828,14 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf); } -static void raid10_quiesce(struct mddev *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int quiesce) { struct r10conf *conf = mddev->private; - switch(state) { - case 1: + if (quiesce) raise_barrier(conf, 0); - break; - case 0: + else lower_barrier(conf); - break; - } } static int raid10_resize(struct mddev *mddev, sector_t sectors) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 59af7cf35092..037ed274807f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1589,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); } -void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log || state == 2) + if (!log) return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); kthread_park(log->reclaim_thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(log->reclaim_thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 328d67aedda4..c3596a27a5a8 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -8,7 +8,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log); extern void r5l_flush_stripe_to_raid(struct r5l_log *log); extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); +extern void r5l_quiesce(struct r5l_log *log, int quiesce); extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5c_is_writeback(struct r5l_log *log); extern int diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 354a969f50a6..17ffa1e44c84 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8008,16 +8008,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } } -static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private; - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8033,17 +8029,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) From ae89fd3de4793c0dc2ec7e9f26b58a357d74a6c7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 18 Oct 2017 19:01:11 -0400 Subject: [PATCH 19/33] md: use TASK_IDLE instead of blocking signals Hi - I submit this patch for the next merge window: Some times ago, I made a patch f9c79bc05a2a that blocks signals around the schedule() calls in MD. The MD subsystem needs to do an uninterruptible sleep that is not accounted in load average - so we block signals and use interruptible sleep. The kernel has a special TASK_IDLE state for this purpose, so we can use it instead of blocking signals. This patch doesn't fix any bug, it just makes the code simpler. Signed-off-by: Mikulas Patocka Acked-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 +------ drivers/md/raid5.c | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9428dfa7e9a0..1f36473c79dc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,7 +37,6 @@ #include #include #include -#include #include @@ -1320,18 +1319,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ DEFINE_WAIT(w); for (;;) { - sigset_t full, old; prepare_to_wait(&conf->wait_barrier, - &w, TASK_INTERRUPTIBLE); + &w, TASK_IDLE); if (!mddev_is_clustered(mddev) || !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 17ffa1e44c84..2a4b34941d86 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,7 +55,6 @@ #include #include #include -#include #include #include From f6eca2d43ed694ab8124dd24c88277f7eca93b7d Mon Sep 17 00:00:00 2001 From: Nate Dailey Date: Tue, 17 Oct 2017 08:17:03 -0400 Subject: [PATCH 20/33] raid1: prevent freeze_array/wait_all_barriers deadlock If freeze_array is attempted in the middle of close_sync/ wait_all_barriers, deadlock can occur. freeze_array will wait for nr_pending and nr_queued to line up. wait_all_barriers increments nr_pending for each barrier bucket, one at a time, but doesn't actually issue IO that could be counted in nr_queued. So freeze_array is blocked until wait_all_barriers completes and allow_all_barriers runs. At the same time, when _wait_barrier sees array_frozen == 1, it stops and waits for freeze_array to complete. Prevent the deadlock by making close_sync call _wait_barrier and _allow_barrier for one bucket at a time, instead of deferring the _allow_barrier calls until after all _wait_barriers are complete. Signed-off-by: Nate Dailey Fix: fd76863e37fe(RAID1: a new I/O barrier implementation to remove resync window) Reviewed-by: Coly Li Cc: stable@vger.kernel.org (v4.11) Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1f36473c79dc..038f5eb299ce 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -989,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr) _wait_barrier(conf, idx); } -static void wait_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _wait_barrier(conf, idx); -} - static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); @@ -1010,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr) _allow_barrier(conf, idx); } -static void allow_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _allow_barrier(conf, idx); -} - /* conf->resync_lock should be held */ static int get_unqueued_pending(struct r1conf *conf) { @@ -1645,8 +1629,12 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_all_barriers(conf); - allow_all_barriers(conf); + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { + _wait_barrier(conf, idx); + _allow_barrier(conf, idx); + } mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; From efa4b77b00b56138fb7e68d2fe8fd1b3c15cd503 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 18 Oct 2017 22:08:13 -0700 Subject: [PATCH 21/33] md: use lockdep_assert_held lockdep_assert_held is a better way to assert lock held, and it works for UP. Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++-- drivers/md/md.h | 5 ----- drivers/md/raid5-cache.c | 12 ++++++------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d441b1d9846c..5a0ec1d1a6e8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2335,7 +2335,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); + lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -6749,7 +6749,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; diff --git a/drivers/md/md.h b/drivers/md/md.h index 998b4ce1498f..7d6bcf0eba0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -500,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev) mutex_lock(&mddev->reconfig_mutex); } -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 037ed274807f..f1c86d938502 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -1200,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log) { struct stripe_head *sh; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); if (!list_empty(&log->no_mem_stripes)) { sh = list_first_entry(&log->no_mem_stripes, @@ -1216,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) struct r5l_io_unit *io, *next; bool found = false; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { /* don't change list order */ @@ -1388,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) * raid5_release_stripe() while holding conf->device_lock */ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); list_del_init(&sh->lru); atomic_inc(&sh->count); @@ -1415,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) int count; struct stripe_head *sh, *next; - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); if (!conf->log) return; From d4098c7262a47f529765d89614484a957363d623 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:50 +0800 Subject: [PATCH 22/33] md-cluster/raid10: set "do_balance = 0" if area is resyncing Just like clustered raid1, it is impossible for cluster raid10 to choose the best device for read balance when the area of array is resyncing. Because we cannot trust the data to be the same on all devices at that time, so we choose just the first one to use, so set do_balance to 0. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 615f677ceb1a..61890231972e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -759,8 +759,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, * the resync window. We take the first readable disk when * above the resync window. */ - if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) + if ((conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { From cb8a7a7e1098e74d36378b992a6d012668ec10d9 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:51 +0800 Subject: [PATCH 23/33] md-cluster: Suspend writes in RAID10 if within range If there is a resync going on, all nodes must suspend writes to the range. This is recorded in suspend_info and suspend_list. If there is an I/O within the ranges of any of the suspend_info, area_resyncing will return 1. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 61890231972e..cc6a56a659a3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1294,6 +1294,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; + if ((mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { + DEFINE_WAIT(w); + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. From 8db87912c9a8771c53b98845cd5516ea63b22e1e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:52 +0800 Subject: [PATCH 24/33] md-cluster: Use a small window for raid10 resync Suspending the entire device for resync could take too long. Resync in small chunks. cluster's resync window is maintained in r10conf as cluster_sync_low and cluster_sync_high, and processed in raid10's sync_request(). If the current resync is outside the cluster resync window: 1. Set the cluster_sync_low to curr_resync_completed. 2. Set cluster_sync_high to cluster_sync_low + stripe size. 3. Send a message to all nodes so they may add it in their suspension list. Note: We only support "near" raid10 so far, resync a far or offset raid10 array could have trouble. So raid10_run checks the layout of clustered raid10, it will refuse to run if the layout is not correct. With the "near" layout we process one stripe at a time progressing monotonically through the address space. So we can have a sliding window of whole-stripes which moves through the array suspending IO on other nodes, and both resync which uses array addresses and recovery which uses device addresses can stay within this window. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 113 +++++++++++++++++++++++++++++++++++++++++++- drivers/md/raid10.h | 6 +++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cc6a56a659a3..b9edbc747a95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) /* * When performing a resync, we need to read and compare, so @@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) return r10bio; } +/* + * Set cluster_sync_high since we need other nodes to add the + * range [cluster_sync_low, cluster_sync_high] to suspend list. + */ +static void raid10_set_cluster_sync_high(struct r10conf *conf) +{ + sector_t window_size; + int extra_chunk, chunks; + + /* + * First, here we define "stripe" as a unit which across + * all member devices one time, so we get chunks by use + * raid_disks / near_copies. Otherwise, if near_copies is + * close to raid_disks, then resync window could increases + * linearly with the increase of raid_disks, which means + * we will suspend a really large IO window while it is not + * necessary. If raid_disks is not divisible by near_copies, + * an extra chunk is needed to ensure the whole "stripe" is + * covered. + */ + + chunks = conf->geo.raid_disks / conf->geo.near_copies; + if (conf->geo.raid_disks % conf->geo.near_copies == 0) + extra_chunk = 0; + else + extra_chunk = 1; + window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; + + /* + * At least use a 32M window to align with raid1's resync window + */ + window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? + CLUSTER_RESYNC_WINDOW_SECTORS : window_size; + + conf->cluster_sync_high = conf->cluster_sync_low + window_size; +} + /* * perform a "sync" on one "block" * @@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* If we aborted, we need to abort the * sync on the 'current' bitmap chucks (there can * be several when recovering multiple devices). @@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); + /* + * Since curr_resync_completed could probably not update in + * time, and we will set cluster_sync_low based on it. + * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for + * safety reason, which ensures curr_resync_completed is + * updated in bitmap_cond_end_sync. + */ + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && @@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* It is resync not recovery */ + if (conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + raid10_set_cluster_sync_high(conf); + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } else if (mddev_is_clustered(mddev)) { + /* This is recovery not resync */ + sector_t sect_va1, sect_va2; + bool broadcast_msg = false; + + for (i = 0; i < conf->geo.raid_disks; i++) { + /* + * sector_nr is a device address for recovery, so we + * need translate it to array address before compare + * with cluster_sync_high. + */ + sect_va1 = raid10_find_virt(conf, sector_nr, i); + + if (conf->cluster_sync_high < sect_va1 + nr_sectors) { + broadcast_msg = true; + /* + * curr_resync_completed is similar as + * sector_nr, so make the translation too. + */ + sect_va2 = raid10_find_virt(conf, + mddev->curr_resync_completed, i); + + if (conf->cluster_sync_low == 0 || + conf->cluster_sync_low > sect_va2) + conf->cluster_sync_low = sect_va2; + } + } + if (broadcast_msg) { + raid10_set_cluster_sync_high(conf); + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } + while (biolist) { bio = biolist; biolist = biolist->bi_next; @@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + if (mddev_is_clustered(conf->mddev)) { + int fc, fo; + + fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); + if (fc > 1 || fo > 0) { + pr_err("only near layout is supported by clustered" + " raid10\n"); + goto out; + } + } + mddev->thread = conf->thread; conf->thread = NULL; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 735ce1a3d260..2bef4e8789c8 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -88,6 +88,12 @@ struct r10conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* + * Keep track of cluster resync window to send to other nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; }; /* From f81f7302e86f5c0a21b59c94164f2510812b7764 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:33:33 +0800 Subject: [PATCH 25/33] raid1: remove obsolete code in raid1_write_request There are some lines could be removed due to recent change for raid1 such as commit 3956df15d634 ("md: move suspend_hi/lo handling into core md code"). Also, seems some comments are put to wrong place, move them before wait_barrier. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 038f5eb299ce..cc9d337a1ed3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1286,27 +1286,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - - if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) { - /* - * As the suspend_* range is controlled by userspace, we want - * an interruptible wait. - */ DEFINE_WAIT(w); for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!mddev_is_clustered(mddev) || - !md_cluster_ops->area_resyncing(mddev, WRITE, + if (!md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; @@ -1314,6 +1302,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } finish_wait(&conf->wait_barrier, &w); } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ wait_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio); From fc33060ba0c78310f6398357ffca8f55a4c41cee Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Oct 2017 16:59:06 +0100 Subject: [PATCH 26/33] md: remove redundant variable q The pointer q is assigned but never read; it is redundant and can be removed. Cleans up clang warning: drivers/md/md-multipath.c:260:4: warning: Value stored to 'q' is never read Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/md-multipath.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 5c70176fa24d..e40065bdbfc8 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf) static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; - struct request_queue *q; int err = -EEXIST; int path; struct multipath_info *p; @@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); From f0e230ad877855567607fe2f40802b6317ad38f3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:53 +0800 Subject: [PATCH 27/33] md-cluster: update document for raid10 Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- Documentation/md/md-cluster.txt | 3 ++- drivers/md/Kconfig | 5 +++-- drivers/md/md-cluster.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt index 82ee51604e9a..e1055f105cf5 100644 --- a/Documentation/md/md-cluster.txt +++ b/Documentation/md/md-cluster.txt @@ -1,4 +1,5 @@ -The cluster MD is a shared-device RAID for a cluster. +The cluster MD is a shared-device RAID for a cluster, it supports +two levels: raid1 and raid10 (limited support). 1. On-disk format diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4a249ee86364..83b9362be09c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -178,7 +178,7 @@ config MD_FAULTY config MD_CLUSTER - tristate "Cluster Support for MD (EXPERIMENTAL)" + tristate "Cluster Support for MD" depends on BLK_DEV_MD depends on DLM default n @@ -188,7 +188,8 @@ config MD_CLUSTER nodes in the cluster can access the MD devices simultaneously. This brings the redundancy (and uptime) of RAID levels across the - nodes of the cluster. + nodes of the cluster. Currently, it can work with raid1 and raid10 + (limited support). If unsure, say N. diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d0fd1bd8575c..79bfbc840385 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = { static int __init cluster_init(void) { - pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); + pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); register_md_cluster_operations(&cluster_ops, THIS_MODULE); return 0; From b90f6ff080c52e2f05364210733df120e3c4e597 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 26 Oct 2017 15:56:54 +0200 Subject: [PATCH 28/33] md: don't check MD_SB_CHANGE_CLEAN in md_allow_write Only MD_SB_CHANGE_PENDING should be used to wait for transition from clean to dirty. Checking also MD_SB_CHANGE_CLEAN is unnecessary and can race with e.g. md_do_sync(). This sporadically causes a hang when changing consistency policy during resync: INFO: task mdadm:6183 blocked for more than 30 seconds. Not tainted 4.14.0-rc3+ #391 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mdadm D12752 6183 6022 0x00000000 Call Trace: __schedule+0x93f/0x990 schedule+0x6b/0x90 md_allow_write+0x100/0x130 [md_mod] ? do_wait_intr_irq+0x90/0x90 resize_stripes+0x3a/0x5b0 [raid456] ? kernfs_fop_write+0xbe/0x180 raid5_change_consistency_policy+0xa6/0x200 [raid456] consistency_policy_store+0x2e/0x70 [md_mod] md_attr_store+0x90/0xc0 [md_mod] sysfs_kf_write+0x42/0x50 kernfs_fop_write+0x119/0x180 __vfs_write+0x28/0x110 ? rcu_sync_lockdep_assert+0x12/0x60 ? __sb_start_write+0x15a/0x1c0 ? vfs_write+0xa3/0x1a0 vfs_write+0xb4/0x1a0 SyS_write+0x49/0xa0 entry_SYSCALL_64_fastpath+0x18/0xad Fixes: 2214c260c72b ("md: don't return -EAGAIN in md_allow_write for external metadata arrays") Cc: Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5a0ec1d1a6e8..2bf4cc41b4f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8125,7 +8125,6 @@ void md_allow_write(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); From 39b4954c0a1556f8f7f1fdcf59a227117fcd8a0b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 3 Nov 2017 11:24:44 -0600 Subject: [PATCH 29/33] badblocks: fix wrong return value in badblocks_set if badblocks are disabled MD's rdev_set_badblocks() expects that badblocks_set() returns 1 if badblocks are disabled, otherwise, rdev_set_badblocks() will record superblock changes and return success in that case and md will fail to report an IO error which it should. This bug has existed since badblocks were introduced in commit 9e0e252a048b ("badblocks: Add core badblock management code"). Signed-off-by: Liu Bo Acked-by: Guoqing Jiang Signed-off-by: Shaohua Li --- block/badblocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/badblocks.c b/block/badblocks.c index 43c71166e1e2..91f7bcf979d3 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -178,7 +178,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors, if (bb->shift < 0) /* badblocks are disabled */ - return 0; + return 1; if (bb->shift) { /* round the start down, and the end up */ From db0505d320660b6ad92418847e7eca6b61b246ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: [PATCH 30/33] md: be cautious about using ->curr_resync_completed for ->recovery_offset The ->recovery_offset shows how much of a non-InSync device is actually in sync - how much has been recoveryed. When performing a recovery, ->curr_resync and ->curr_resync_completed follow the device address being recovered and so can be used to update ->recovery_offset. When performing a reshape, ->curr_resync* might follow the device addresses (raid5) or might follow array addresses (raid10), so cannot in general be used to set ->recovery_offset. When reshaping backwards, ->curre_resync* measures from the *end* of the array-or-device, so is particularly unhelpful. So change the common code in md.c to only use ->curr_resync_complete for the simple recovery case, and add code to raid5.c to update ->recovery_offset during a forwards reshape. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 33 ++++++++++++++++++++++----------- drivers/md/raid5.c | 24 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2bf4cc41b4f8..15e4668f594c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2454,10 +2454,18 @@ repeat: } } - /* First make sure individual recovery_offsets are correct */ + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. + */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) @@ -8491,16 +8499,19 @@ void md_do_sync(struct md_thread *thread) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2a4b34941d86..1649e82faae2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5738,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk */ struct r5conf *conf = mddev->private; struct stripe_head *sh; + struct md_rdev *rdev; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -5860,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -5958,6 +5968,14 @@ finish: goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -7945,6 +7963,7 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -7952,6 +7971,11 @@ static void end_reshape(struct r5conf *conf) smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); From 97f0eb9f0fec0563c1c796d95123e871b8bb65c0 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 6 Nov 2017 10:11:25 +0800 Subject: [PATCH 31/33] md/bitmap: clear BITMAP_WRITE_ERROR bit before writing it to sb For a RAID1 device using a file-based bitmap, if a bitmap write error occurs but the later writes succeed, it's possible both BITMAP_STALE and BITMAP_WRITE_ERROR bits will be written to the bitmap super block, the BITMAP_STALE bit will be handled properly and be cleared, but the BITMAP_WRITE_ERROR bit in sb->flags will make bitmap_create() to fail. So clear it to protect against the write failure-and-then-recovery case. Signed-off-by: Hou Tao Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index d1b3b60669ea..a60e46529d9f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); + /* + * clear BITMAP_WRITE_ERROR bit to protect against the case that + * a bitmap write error occurred but the later writes succeeded. + */ + sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); From 0202ce8a90efdc81600e7bf9712d8c324081a924 Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Wed, 8 Nov 2017 13:44:55 +0100 Subject: [PATCH 32/33] md: release allocated bitset sync_set Patch fixes kmemleak on md_stop() path used likely only by dm-raid wrapper. Code of md is using mddev_put() where both bitsets are released however this freeing is not shared. Also set NULL to bio_set and sync_set pointers just like mddev_put is doing. Signed-off-by: Zdenek Kabelac Signed-off-by: Shaohua Li --- drivers/md/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 15e4668f594c..e014d39159d7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5852,8 +5852,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) + if (mddev->bio_set) { bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } } EXPORT_SYMBOL_GPL(md_stop); From 0868b99c214a3d55486c700de7c3f770b7243e7c Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Wed, 8 Nov 2017 13:44:56 +0100 Subject: [PATCH 33/33] md: free unused memory after bitmap resize When bitmap is resized, the old kalloced chunks just are not released once the resized bitmap starts to use new space. This fixes in particular kmemleak reports like this one: unreferenced object 0xffff8f4311e9c000 (size 4096): comm "lvm", pid 19333, jiffies 4295263268 (age 528.265s) hex dump (first 32 bytes): 02 80 02 80 02 80 02 80 02 80 02 80 02 80 02 80 ................ 02 80 02 80 02 80 02 80 02 80 02 80 02 80 02 80 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x14e/0x2e0 [] bitmap_checkpage+0x7c/0x110 [] bitmap_get_counter+0x45/0xd0 [] bitmap_set_memory_bits+0x43/0xe0 [] bitmap_init_from_disk+0x23c/0x530 [] bitmap_load+0xbe/0x160 [] raid_preresume+0x203/0x2f0 [dm_raid] [] dm_table_resume_targets+0x4f/0xe0 [] dm_resume+0x122/0x140 [] dev_suspend+0x18f/0x290 [] ctl_ioctl+0x287/0x560 [] dm_ctl_ioctl+0x13/0x20 [] do_vfs_ioctl+0xa6/0x750 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x1f/0xc2 Signed-off-by: Zdenek Kabelac Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a60e46529d9f..bb45c0ccc1bf 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2162,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, for (k = 0; k < page; k++) { kfree(new_bp[k].map); } + kfree(new_bp); /* restore some fields from old_counts */ bitmap->counts.bp = old_counts.bp; @@ -2212,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, block += old_blocks; } + if (bitmap->counts.bp != old_counts.bp) { + unsigned long k; + for (k = 0; k < old_counts.pages; k++) + if (!old_counts.bp[k].hijacked) + kfree(old_counts.bp[k].map); + kfree(old_counts.bp); + } + if (!init) { int i; while (block < (chunks << chunkshift)) {