From 2b74e12e567feb4163e32815bce0be57489e73b9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 Dec 2010 15:59:01 +1100 Subject: [PATCH 1/5] md: remove handling of flush_pending in md_submit_flush_data None of the functions called between setting flush_pending to 1, and atomic_dec_and_test can change flush_pending, nor will anything running in any other thread (as ->flush_bio is not NULL). So the atomic_dec_and_test will always succeed. So remove the atomic_set and the atomic_dec_and_test. Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 84c46a161927..83b6cb3e7025 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -404,8 +404,6 @@ static void md_submit_flush_data(struct work_struct *ws) mddev_t *mddev = container_of(ws, mddev_t, flush_work); struct bio *bio = mddev->flush_bio; - atomic_set(&mddev->flush_pending, 1); - if (bio->bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); @@ -414,10 +412,9 @@ static void md_submit_flush_data(struct work_struct *ws) if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); } - if (atomic_dec_and_test(&mddev->flush_pending)) { - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); - } + + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); } void md_flush_request(mddev_t *mddev, struct bio *bio) From a7a07e69653acf8540daa1da053cd84bf86e8e66 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 Dec 2010 16:04:25 +1100 Subject: [PATCH 2/5] md: move code in to submit_flushes. submit_flushes is called from exactly one place. Move the code that is before and after that call into submit_flushes. This has no functional change, but will make the next patch smaller and easier to follow. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 83b6cb3e7025..31f8e151d893 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -371,10 +371,14 @@ static void md_end_flush(struct bio *bio, int err) bio_put(bio); } +static void md_submit_flush_data(struct work_struct *ws); + static void submit_flushes(mddev_t *mddev) { mdk_rdev_t *rdev; + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && @@ -397,6 +401,8 @@ static void submit_flushes(mddev_t *mddev) rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); } static void md_submit_flush_data(struct work_struct *ws) @@ -426,13 +432,7 @@ void md_flush_request(mddev_t *mddev, struct bio *bio) mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); - atomic_set(&mddev->flush_pending, 1); - INIT_WORK(&mddev->flush_work, md_submit_flush_data); - submit_flushes(mddev); - - if (atomic_dec_and_test(&mddev->flush_pending)) - queue_work(md_wq, &mddev->flush_work); } EXPORT_SYMBOL(md_flush_request); From a035fc3e2531703b539f23bec4ca7943cfc69349 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 Dec 2010 16:17:51 +1100 Subject: [PATCH 3/5] md: fix possible deadlock in handling flush requests. As recorded in https://bugzilla.kernel.org/show_bug.cgi?id=24012 it is possible for a flush request through md to hang. This is due to an interaction between the recursion avoidance in generic_make_request, the insistence in md of only having one flush active at a time, and the possibility of dm (or md) submitting two flush requests to a device from the one generic_make_request. 
If a generic_make_request call into dm causes two flush requests to be queued (as happens if the dm table has two targets - they get one each), these two will be queued inside generic_make_request. Assume they are for the same md device. The first is processed and causes 1 or more flush requests to be sent to lower devices. These get queued within generic_make_request too. Then the second flush to the md device gets handled and it blocks waiting for the first flush to complete. But it won't complete until the two lower-device requests complete, and they haven't even been submitted yet as they are on the generic_make_request queue. The deadlock can be broken by using a separate thread to submit the requests to lower devices. md has such a thread readily available: md_wq. So use it to submit these requests. Reported-by: Giacomo Catenazzi Tested-by: Giacomo Catenazzi Signed-off-by: NeilBrown --- drivers/md/md.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 31f8e151d893..d66aaeddf95d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -373,8 +373,9 @@ static void md_end_flush(struct bio *bio, int err) static void md_submit_flush_data(struct work_struct *ws); -static void submit_flushes(mddev_t *mddev) +static void submit_flushes(struct work_struct *ws) { + mddev_t *mddev = container_of(ws, mddev_t, flush_work); mdk_rdev_t *rdev; INIT_WORK(&mddev->flush_work, md_submit_flush_data); @@ -432,7 +433,8 @@ void md_flush_request(mddev_t *mddev, struct bio *bio) mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); - submit_flushes(mddev); + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); } EXPORT_SYMBOL(md_flush_request); From 1a855a0606653d2d82506281e2c686bacb4b2f45 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 Dec 2010 16:36:28 +1100 Subject: [PATCH 4/5] md: fix bug with re-adding of partially recovered device. 
With v0.90 metadata, a hot-spare does not become a full member of the array until recovery is complete. So if we re-add such a device to the array, we know that all of it is as up-to-date as the event count would suggest, and so a bitmap-based recovery is possible. However with v1.x metadata, the hot-spare immediately becomes a full member of the array, but it records how much of the device has been recovered. If the array is stopped and re-assembled, recovery starts from this point. When such a device is hot-added to an array we currently lose the 'how much is recovered' information and incorrectly include it as a full in-sync member (after bitmap-based fixup). This is wrong and unsafe and could corrupt data. So be more careful about setting saved_raid_disk - which is what guides the re-adding of devices back into an array. The new code matches the code in slot_store which does a similar thing, which is encouraging. This is suitable for any -stable kernel. Reported-by: "Dailey, Nate" Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d66aaeddf95d..b757da175180 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5159,7 +5159,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) PTR_ERR(rdev)); return PTR_ERR(rdev); } - /* set save_raid_disk if appropriate */ + /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) @@ -5169,7 +5169,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) } else super_types[mddev->major_version]. 
validate_super(mddev, rdev); - rdev->saved_raid_disk = rdev->raid_disk; + if (test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = rdev->raid_disk; + else + rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1< Date: Thu, 9 Dec 2010 17:02:14 +1100 Subject: [PATCH 5/5] md: protect against NULL reference when waiting to start a raid10. When we fail to start a raid10 for some reason, we call md_unregister_thread to kill the thread that was created. Unfortunately md_thread() will then make one call into the handler (raid10d) even though md_wakeup_thread has not been called. This is not safe and as md_unregister_thread is called after mddev->private has been set to NULL, it will definitely cause a NULL dereference. So fix this at both ends: - md_thread should only call the handler if THREAD_WAKEUP has been set. - raid10 should call md_unregister_thread before setting things to NULL just like all the other raid modules do. This is applicable to 2.6.35 and later. 
Cc: stable@kernel.org Reported-by: "Citizen" Signed-off-by: NeilBrown --- drivers/md/md.c | 5 ++--- drivers/md/raid10.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b757da175180..e71c5fa527f5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6044,9 +6044,8 @@ static int md_thread(void * arg) || kthread_should_stop(), thread->timeout); - clear_bit(THREAD_WAKEUP, &thread->flags); - - thread->run(thread->mddev); + if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags)) + thread->run(thread->mddev); } return 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c67aa54694ae..0641674827f0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2397,13 +2397,13 @@ static int run(mddev_t *mddev) return 0; out_free_conf: + md_unregister_thread(mddev->thread); if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); mddev->private = NULL; - md_unregister_thread(mddev->thread); out: return -EIO; }