From f3b99be19ded511a1bf05a148276239d9f13eefa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Jun 2010 13:31:03 +1000 Subject: [PATCH 01/12] Restore partition detection of newly created md arrays. Commit b821eaa572fd737faaf6928ba046e571526c36c6 broke partition detection for md arrays. The logic was almost right. However if revalidate_disk is called when the device is not yet open, bdev->bd_disk won't be set, so the flush_disk() call will not set bd_invalidated. So when md_open is called we still need to ensure that ->bd_invalidated gets set. This is easily done with a call to check_disk_size_change in the place where the offending commit removed check_disk_change. At the important times, the size will have changed from 0 to non-zero, so check_disk_size_change will set bd_invalidated. Tested-by: Duncan <1i5t5.duncan@cox.net> Reported-by: Duncan <1i5t5.duncan@cox.net> Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 46b3a044eadf..4edcda8f4869 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5895,6 +5895,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); + check_disk_size_change(mddev->gendisk, bdev); out: return err; } From 0544a21db02c1d8883158fd6f323364f830a120a Mon Sep 17 00:00:00 2001 From: "Prasanna S. Panchamukhi" Date: Thu, 24 Jun 2010 13:31:03 +1000 Subject: [PATCH 02/12] md: raid10: Fix null pointer dereference in fix_read_error() Such NULL pointer dereference can occur when the driver was fixing the read errors/bad blocks and the disk was physically removed causing a system crash. This patch checks if the rcu_dereference() returns a valid rdev before accessing it in fix_read_error(). Cc: stable@kernel.org Signed-off-by: Prasanna S. 
Panchamukhi Signed-off-by: Rob Becker Signed-off-by: NeilBrown --- drivers/md/raid10.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 03724992cdf2..6d420cb487b5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int sectors = r10_bio->sectors; mdk_rdev_t*rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int d = r10_bio->devs[r10_bio->read_slot].devnum; rcu_read_lock(); - { - int d = r10_bio->devs[r10_bio->read_slot].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev) { /* If rdev is not NULL */ char b[BDEVNAME_SIZE]; int cur_read_error_count = 0; - rdev = rcu_dereference(conf->mirrors[d].rdev); bdevname(rdev->bdev, b); if (test_bit(Faulty, &rdev->flags)) { @@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { - int d = r10_bio->devs[sl].devnum; + d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { @@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); while (sl != r10_bio->read_slot) { char b[BDEVNAME_SIZE]; - int d; + if (sl==0) sl = conf->copies; sl--; @@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } sl = start; while (sl != r10_bio->read_slot) { - int d; + if (sl==0) sl = conf->copies; sl--; From e93f68a1fc6244c05ad8fae28e75835ec74ab34e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 15 Jun 2010 09:36:03 +0100 Subject: [PATCH 03/12] md: fix handling of array level takeover that re-arranges devices. Most array level changes leave the list of devices largely unchanged, possibly causing one at the end to become redundant. 
However conversions between RAID0 and RAID10 need to renumber all devices (except 0). This renumbering is currently being done in the ->run method when the new personality takes over. However this is too late as the common code in md.c might already have invalidated some of the devices if they had a ->raid_disk number that appeared too high. Moving it into the ->takeover method is too early as the array is still active at that time and wrong ->raid_disk numbers could cause confusion. So add a ->new_raid_disk field to mdk_rdev_s and use it to communicate the new raid_disk number. Now the common code knows exactly which devices need to be renumbered, and which can be invalidated, and can do it all at a convenient time when the array is suspended. It can also update some symlinks in sysfs which previously were not updated correctly. Reported-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/md.c | 35 ++++++++++++++++++++++++++++++----- drivers/md/md.h | 3 +++ drivers/md/raid0.c | 11 +++-------- drivers/md/raid0.h | 3 --- drivers/md/raid10.c | 19 +++++-------------- drivers/md/raid10.h | 5 ----- 6 files changed, 41 insertions(+), 35 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4edcda8f4869..4869128bf742 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3001,6 +3001,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; } + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->new_raid_disk = rdev->raid_disk; + /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. 
*/ @@ -3051,13 +3054,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->safemode = 0; } - module_put(mddev->pers->owner); - /* Invalidate devices that are now superfluous */ - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= mddev->raid_disks) { - rdev->raid_disk = -1; + list_for_each_entry(rdev, &mddev->disks, same_set) { + char nm[20]; + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk > mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); + else { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + printk("md: cannot register %s for %s after level change\n", + nm, mdname(mddev)); } + } + + module_put(mddev->pers->owner); mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7ab5ea155452..10597bfec000 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -78,6 +78,9 @@ struct mdk_rdev_s int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ + int new_raid_disk; /* role that the device will have in + * the array after a level-change completes. 
+ */ int saved_raid_disk; /* role that device used to have in the * array and could again if we did a partial * resync from the bitmap diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e70f004c99e8..7c7c38058bc2 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; - if (mddev->level == 10) + if (mddev->level == 10) { /* taking over a raid10-n2 array */ j /= 2; + rdev1->new_raid_disk = j; + } if (j < 0 || j >= mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: bad disk number %d - " @@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev) mddev->private = conf; } conf = mddev->private; - if (conf->scale_raid_disks) { - int i; - for (i=0; i < conf->strip_zone[0].nb_dev; i++) - conf->devlist[i]->raid_disk /= conf->scale_raid_disks; - /* FIXME update sysfs rd links */ - } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); @@ -643,7 +639,6 @@ static void *raid0_takeover_raid10(mddev_t *mddev) mddev->recovery_cp = MaxSector; create_strip_zones(mddev, &priv_conf); - priv_conf->scale_raid_disks = 2; return priv_conf; } diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index d724e664ca4d..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -13,9 +13,6 @@ struct raid0_private_data struct strip_zone *strip_zone; mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ int nr_strip_zones; - int scale_raid_disks; /* divide rdev->raid_disks by this in run() - * to handle conversion from raid10 - */ }; typedef struct raid0_private_data raid0_conf_t; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6d420cb487b5..1bab3559f3e2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev) if (!conf->thread) goto out; - conf->scale_disks = 0; 
conf->mddev = mddev; return conf; @@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev) if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; - if (conf->scale_disks) { - disk_idx *= conf->scale_disks; - rdev->raid_disk = disk_idx; - /* MOVE 'rd%d' link !! */ - } disk = conf->mirrors + disk_idx; disk->rdev = rdev; @@ -2435,13 +2429,6 @@ static void *raid10_takeover_raid0(mddev_t *mddev) return ERR_PTR(-EINVAL); } - /* Update slot numbers to obtain - * degraded raid10 with missing mirrors - */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - rdev->raid_disk *= 2; - } - /* Set new parameters */ mddev->new_level = 10; /* new layout: far_copies = 1, near_copies = 2 */ @@ -2454,7 +2441,11 @@ static void *raid10_takeover_raid0(mddev_t *mddev) mddev->recovery_cp = MaxSector; conf = setup_conf(mddev); - conf->scale_disks = 2; + if (!IS_ERR(conf)) + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) + rdev->new_raid_disk = rdev->raid_disk * 2; + return conf; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3824a087e17c..2316ac2e8e21 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -38,11 +38,6 @@ struct r10_private_data_s { int chunk_shift; /* shift from chunks to sectors */ sector_t chunk_mask; - int scale_disks; /* When starting array, multiply - * each ->raid_disk by this. - * Need for raid0->raid10 migration - */ - struct list_head retry_list; /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; From f73ea87375a1b2bf6c0be82bb9a3cb9d5ee7a407 Mon Sep 17 00:00:00 2001 From: Maciej Trela Date: Wed, 16 Jun 2010 11:46:29 +0100 Subject: [PATCH 04/12] md: fix raid10 takeover: use new_layout for setup_conf Use mddev->new_layout in setup_conf. Also use new_chunk, and don't set ->degraded in takeover(). 
That gets set in run() Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/raid10.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1bab3559f3e2..42e64e4e5e25 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev) sector_t stride, size; int err = -EINVAL; - if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || - !is_power_of_2(mddev->chunk_sectors)) { + if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->new_chunk_sectors)) { printk(KERN_ERR "md/raid10:%s: chunk size must be " "at least PAGE_SIZE(%ld) and be a power of 2.\n", mdname(mddev), PAGE_SIZE); goto out; } - nc = mddev->layout & 255; - fc = (mddev->layout >> 8) & 255; - fo = mddev->layout & (1<<16); + nc = mddev->new_layout & 255; + fc = (mddev->new_layout >> 8) & 255; + fo = mddev->new_layout & (1<<16); if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->layout >> 17)) { + (mddev->new_layout >> 17)) { printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", - mdname(mddev), mddev->layout); + mdname(mddev), mddev->new_layout); goto out; } @@ -2435,7 +2435,6 @@ static void *raid10_takeover_raid0(mddev_t *mddev) mddev->new_layout = (1<<8) + 2; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->delta_disks = mddev->raid_disks; - mddev->degraded = mddev->raid_disks; mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; From 001048a318d48e93cb6a1246f3b20335b2a7c855 Mon Sep 17 00:00:00 2001 From: Maciej Trela Date: Wed, 16 Jun 2010 11:55:14 +0100 Subject: [PATCH 05/12] md: clear layout after ->raid0 takeover After takeover from raid5/10 -> raid0 mddev->layout is not cleared. 
Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/raid0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7c7c38058bc2..ac09b7d38553 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -592,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev) /* Set new parameters */ mddev->new_level = 0; + mddev->new_layout = 0; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks--; mddev->delta_disks = -1; @@ -631,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev) /* Set new parameters */ mddev->new_level = 0; + mddev->new_layout = 0; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->delta_disks = - mddev->raid_disks / 2; mddev->raid_disks += mddev->delta_disks; From 049d6c1ef983c9ac43aa423dfd752071a5b0002d Mon Sep 17 00:00:00 2001 From: Maciej Trela Date: Wed, 16 Jun 2010 11:56:12 +0100 Subject: [PATCH 06/12] md: enable raid4->raid0 takeover Only level 5 with layout=PARITY_N can be taken over to raid0 now. Let's allow level 4 as well. Signed-off-by: Maciej Trela Signed-off-by: NeilBrown --- drivers/md/raid0.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ac09b7d38553..563abed5a2cb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -569,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) return; } -static void *raid0_takeover_raid5(mddev_t *mddev) +static void *raid0_takeover_raid45(mddev_t *mddev) { mdk_rdev_t *rdev; raid0_conf_t *priv_conf; @@ -647,12 +647,16 @@ static void *raid0_takeover_raid10(mddev_t *mddev) static void *raid0_takeover(mddev_t *mddev) { /* raid0 can take over: + * raid4 - if all data disks are active. 
* raid5 - providing it is Raid4 layout and one disk is faulty * raid10 - assuming we have all necessary active disks */ + if (mddev->level == 4) + return raid0_takeover_raid45(mddev); + if (mddev->level == 5) { if (mddev->layout == ALGORITHM_PARITY_N) - return raid0_takeover_raid5(mddev); + return raid0_takeover_raid45(mddev); printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", mdname(mddev), ALGORITHM_PARITY_N); From e4e11e385d1e5516ac76c956d6c25e6c2fa1b8d0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Jun 2010 16:45:16 +1000 Subject: [PATCH 07/12] md/raid5: avoid oops when number of devices is reduced then increased. The entries in the stripe_cache maintained by raid5 are enlarged when we increase the number of devices in the array, but not shrunk when we reduce the number of devices. So if entries are added after reducing the number of devices, we must be sure to initialise the whole entry, not just the part that is currently relevant. Otherwise if we enlarge the array again, we will reference uninitialised values. As grow_buffers/shrink_buffers now want to use a count that is stored explicitly in the raid_conf, they should get it from there rather than being passed it as a parameter. 
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2c0f94fa37d..2c055dec8c68 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -277,12 +277,13 @@ out: return sh; } -static void shrink_buffers(struct stripe_head *sh, int num) +static void shrink_buffers(struct stripe_head *sh) { struct page *p; int i; + int num = sh->raid_conf->pool_size; - for (i=0; idev[i].page; if (!p) continue; @@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) } } -static int grow_buffers(struct stripe_head *sh, int num) +static int grow_buffers(struct stripe_head *sh) { int i; + int num = sh->raid_conf->pool_size; - for (i=0; iraid_disks, conf->previous_raid_disks); sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); + memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif - if (grow_buffers(sh, disks)) { - shrink_buffers(sh, disks); + if (grow_buffers(sh)) { + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); return 0; } @@ -1468,7 +1469,7 @@ static int drop_one_stripe(raid5_conf_t *conf) if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh, conf->pool_size); + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; From 70fffd0bfab1558a8c64c5e903dea1fb84cd9f6b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Jun 2010 17:01:25 +1000 Subject: [PATCH 08/12] md: Don't update ->recovery_offset when reshaping an array to fewer devices. When an array is reshaped to have fewer devices, the reshape proceeds from the end of the devices to the beginning. 
If a device happens to be non-In_sync (which is possible but rare) we would normally update the ->recovery_offset as the reshape progresses. However that would be wrong as the recovery_offset records that the early part of the device is in_sync, while in fact it would only be the later part that is in_sync, and in any case the offset number would be measured from the wrong end of the device. Relatedly, if after a reshape a spare is discovered to not be recovered all the way to the end, do not allow spare_active to incorporate it in the array. This becomes relevant in the following sample scenario: A 4 drive RAID5 is converted to a 6 drive RAID6 in a combined operation. The RAID5->RAID6 conversion will cause a 5th drive to be included as a spare, then the 5drive -> 6drive reshape will effectively rebuild that spare as it progresses. The 6th drive is treated as in_sync the whole time as there is never any case that we might consider reading from it, but must not because there is no valid data. If we interrupt this reshape part-way through and reverse it to return to a 5-drive RAID6 (or even a 4-drive RAID5), we don't want to update the recovery_offset - as that would be wrong - and we don't want to include that spare as active in the 5-drive RAID6 when the reversed reshape completes, as it will be mostly out-of-sync still. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ drivers/md/raid5.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4869128bf742..cb20d0b0555a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) /* First make sure individual recovery_offsets are correct */ list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -6872,6 +6873,7 @@ void md_do_sync(mddev_t *mddev) rcu_read_lock(); list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2c055dec8c68..f972a94bbc32 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5208,6 +5208,7 @@ static int raid5_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { unsigned long flags; From 674806d62fb02a22eea948c9f1b5e58e0947b728 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Jun 2010 17:17:53 +1000 Subject: [PATCH 09/12] md/raid5: More careful check for "has array failed". When we are reshaping an array, the device failure combinations that cause us to decide that the array has failed are more subtle. In particular, any 'spare' will be fully in-sync in the section of the array that has already been reshaped, thus failures that affect only that section are less critical. So encode this subtlety in a new function and call it as appropriate. 
The case that showed this problem was a 4 drive RAID5 to 8 drive RAID6 conversion where the last two devices failed. This resulted in: good good good good incomplete good good failed failed while converting a 5-drive RAID6 to 8 drive RAID5. The incomplete device causes the whole array to look bad, but as it was actually good for the section that had been converted to 8-drives, all the data was actually safe. Reported-by: Terry Morris Signed-off-by: NeilBrown --- drivers/md/raid5.c | 75 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f972a94bbc32..d4b233c25f2e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -366,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, return NULL; } +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int has_failed(raid5_conf_t *conf) +{ + int degraded; + int i; + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. 
+ * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. + */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + return 0; +} + static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); @@ -5006,7 +5073,7 @@ static int run(mddev_t *mddev) mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - working_disks); - if (mddev->degraded > conf->max_degraded) { + if (has_failed(conf)) { printk(KERN_ERR "md/raid:%s: not enough operational devices" " (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); @@ -5244,7 +5311,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. 
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded && + !has_failed(conf) && number < conf->raid_disks) { err = -EBUSY; goto abort; @@ -5272,7 +5339,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5364,7 +5431,7 @@ static int check_reshape(mddev_t *mddev) if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) return -EINVAL; if (mddev->delta_disks < 0) { /* We might be able to shrink, but the devices must From 415e72d034c50520ddb7ff79e7d1792c1306f0c9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 17 Jun 2010 17:25:21 +1000 Subject: [PATCH 10/12] md/raid5: Allow recovered part of partially recovered devices to be in-sync During a recovery or reshape the early part of some devices might be in-sync while the later parts are not. When we know we are looking at an early part it is good to treat that part as in-sync for stripe calculations. This is particularly important for a reshape which suffers device failure. Treating the data as in-sync can mean the difference between data-safety and data-loss. 
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d4b233c25f2e..09f07dadf404 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3031,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx toread %p read %p write %p " "written %p\n", i, dev->flags, dev->toread, dev->read, @@ -3068,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* could be in-sync depending on recovery/reshape status */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { s.failed++; s.failed_num = i; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -3312,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -3350,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, 
&rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* in sync if before recovery_offset */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { if (s.failed < 2) r6s.failed_num[s.failed] = i; s.failed++; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); From 2f115882499f3e5eca33d1df07b8876cc752a1ff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 17 Jun 2010 17:41:03 +1000 Subject: [PATCH 11/12] md/raid5: add a missing 'continue' in a loop. As the comment says, the tail of this loop only applies to devices that are not fully in sync, so if In_sync was set, we should avoid the rest of the loop. This bug will hardly ever cause an actual problem. The worst it can do is allow an array to be assembled that is dirty and degraded, which is not generally a good idea (without warning the sysadmin first). This will only happen if the array is RAID4 or a RAID5/6 in an intermediate state during a reshape and so has one drive that is all 'parity' - no data - while some other device has failed. This is certainly possible, but not at all common. 
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 09f07dadf404..66cd47973398 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5057,8 +5057,10 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk < 0) continue; - if (test_bit(In_sync, &rdev->flags)) + if (test_bit(In_sync, &rdev->flags)) { working_disks++; + continue; + } /* This disc is not fully in-sync. However if it * just stored parity (beyond the recovery_offset), * when we don't need to be concerned about the From 3424bf6a772cff606fc4bc24a3639c937afb547f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 17 Jun 2010 17:48:26 +1000 Subject: [PATCH 12/12] md/raid5: don't include 'spare' drives when reshaping to fewer devices. There are few situations where it would make any sense to add a spare when reducing the number of devices in an array, but it is conceivable: A 6 drive RAID6 with two missing devices could be reshaped to a 5 drive RAID6, and a spare could become available just in time for the reshape, but not early enough to have been recovered first. 'freezing' recovery can make this easy to do without any races. However doing such a thing is a bad idea. md will not record the partially-recovered state of the 'spare' and when the reshape finished it will think that the spare is still spare. Easiest way to avoid this confusion is to simply disallow it. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 66cd47973398..96c690279fc6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5526,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. 
+ * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. */ - list_for_each_entry(rdev, &mddev->disks, same_set) + if (mddev->delta_disks >= 0) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { @@ -5549,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev) } /* When a reshape changes the number of devices, ->degraded - * is measured against the large of the pre and post number of + * is measured against the larger of the pre and post number of * devices.*/ if (mddev->delta_disks > 0) { spin_lock_irqsave(&conf->device_lock, flags);