From f3b99be19ded511a1bf05a148276239d9f13eefa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 24 Jun 2010 13:31:03 +1000
Subject: [PATCH 01/12] Restore partition detection of newly created md arrays.

Commit  b821eaa572fd737faaf6928ba046e571526c36c6 broke partition
detection for md arrays.

The logic was almost right.  However if revalidate_disk is called
when the device is not yet open, bdev->bd_disk won't be set, so the
flush_disk() Call will not set bd_invalidated.

So when md_open is called we still need to ensure that
->bd_invalidated gets set.  This is easily done with a call to
check_disk_size_change in the place where the offending commit removed
check_disk_change.  At the important times, the size will have changed
from 0 to non-zero, so check_disk_size_change will set bd_invalidated.

Tested-by: Duncan <1i5t5.duncan@cox.net>
Reported-by: Duncan <1i5t5.duncan@cox.net>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 46b3a044eadf..4edcda8f4869 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5895,6 +5895,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	atomic_inc(&mddev->openers);
 	mutex_unlock(&mddev->open_mutex);
 
+	check_disk_size_change(mddev->gendisk, bdev);
  out:
 	return err;
 }

From 0544a21db02c1d8883158fd6f323364f830a120a Mon Sep 17 00:00:00 2001
From: "Prasanna S. Panchamukhi" <prasanna.panchamukhi@riverbed.com>
Date: Thu, 24 Jun 2010 13:31:03 +1000
Subject: [PATCH 02/12] md: raid10: Fix null pointer dereference in
 fix_read_error()

Such NULL pointer dereference can occur when the driver was fixing the
read errors/bad blocks and the disk was physically removed
causing a system crash. This patch check if the
rcu_dereference() returns valid rdev before accessing it in fix_read_error().

Cc: stable@kernel.org
Signed-off-by: Prasanna S. Panchamukhi <prasanna.panchamukhi@riverbed.com>
Signed-off-by: Rob Becker <rbecker@riverbed.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 03724992cdf2..6d420cb487b5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int sectors = r10_bio->sectors;
 	mdk_rdev_t*rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
 	rcu_read_lock();
-	{
-		int d = r10_bio->devs[r10_bio->read_slot].devnum;
+	rdev = rcu_dereference(conf->mirrors[d].rdev);
+	if (rdev) { /* If rdev is not NULL */
 		char b[BDEVNAME_SIZE];
 		int cur_read_error_count = 0;
 
-		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		bdevname(rdev->bdev, b);
 
 		if (test_bit(Faulty, &rdev->flags)) {
@@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 
 		rcu_read_lock();
 		do {
-			int d = r10_bio->devs[sl].devnum;
+			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags)) {
@@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		rcu_read_lock();
 		while (sl != r10_bio->read_slot) {
 			char b[BDEVNAME_SIZE];
-			int d;
+
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 		}
 		sl = start;
 		while (sl != r10_bio->read_slot) {
-			int d;
+
 			if (sl==0)
 				sl = conf->copies;
 			sl--;

From e93f68a1fc6244c05ad8fae28e75835ec74ab34e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 15 Jun 2010 09:36:03 +0100
Subject: [PATCH 03/12] md: fix handling of array level takeover that
 re-arranges devices.

Most array level changes leave the list of devices largely unchanged,
possibly causing one at the end to become redundant.
However conversions between RAID0 and RAID10 need to renumber
all devices (except 0).

This renumbering is currently being done in the ->run method when the
new personality takes over.  However this is too late as the common
code in md.c might already have invalidated some of the devices if
they had a ->raid_disk number that appeared to high.

Moving it into the ->takeover method is too early as the array is
still active at that time and wrong ->raid_disk numbers could cause
confusion.

So add a ->new_raid_disk field to mdk_rdev_s and use it to communicate
the new raid_disk number.
Now the common code knows exactly which devices need to be renumbered,
and which can be invalidated, and can do it all at a convenient time
when the array is suspend.
It can also update some symlinks in sysfs which previously were not be
updated correctly.

Reported-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 35 ++++++++++++++++++++++++++++++-----
 drivers/md/md.h     |  3 +++
 drivers/md/raid0.c  | 11 +++--------
 drivers/md/raid0.h  |  3 ---
 drivers/md/raid10.c | 19 +++++--------------
 drivers/md/raid10.h |  5 -----
 6 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4edcda8f4869..4869128bf742 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3001,6 +3001,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EINVAL;
 	}
 
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		rdev->new_raid_disk = rdev->raid_disk;
+
 	/* ->takeover must set new_* and/or delta_disks
 	 * if it succeeds, and may set them when it fails.
 	 */
@@ -3051,13 +3054,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 		mddev->safemode = 0;
 	}
 
-	module_put(mddev->pers->owner);
-	/* Invalidate devices that are now superfluous */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		if (rdev->raid_disk >= mddev->raid_disks) {
-			rdev->raid_disk = -1;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		char nm[20];
+		if (rdev->raid_disk < 0)
+			continue;
+		if (rdev->new_raid_disk > mddev->raid_disks)
+			rdev->new_raid_disk = -1;
+		if (rdev->new_raid_disk == rdev->raid_disk)
+			continue;
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		sysfs_remove_link(&mddev->kobj, nm);
+	}
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (rdev->raid_disk < 0)
+			continue;
+		if (rdev->new_raid_disk == rdev->raid_disk)
+			continue;
+		rdev->raid_disk = rdev->new_raid_disk;
+		if (rdev->raid_disk < 0)
 			clear_bit(In_sync, &rdev->flags);
+		else {
+			char nm[20];
+			sprintf(nm, "rd%d", rdev->raid_disk);
+			if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+				printk("md: cannot register %s for %s after level change\n",
+				       nm, mdname(mddev));
 		}
+	}
+
+	module_put(mddev->pers->owner);
 	mddev->pers = pers;
 	mddev->private = priv;
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7ab5ea155452..10597bfec000 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -78,6 +78,9 @@ struct mdk_rdev_s
 
 	int desc_nr;			/* descriptor index in the superblock */
 	int raid_disk;			/* role of device in array */
+	int new_raid_disk;		/* role that the device will have in
+					 * the array after a level-change completes.
+					 */
 	int saved_raid_disk;		/* role that device used to have in the
 					 * array and could again if we did a partial
 					 * resync from the bitmap
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e70f004c99e8..7c7c38058bc2 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
 
-		if (mddev->level == 10)
+		if (mddev->level == 10) {
 			/* taking over a raid10-n2 array */
 			j /= 2;
+			rdev1->new_raid_disk = j;
+		}
 
 		if (j < 0 || j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
@@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev)
 		mddev->private = conf;
 	}
 	conf = mddev->private;
-	if (conf->scale_raid_disks) {
-		int i;
-		for (i=0; i < conf->strip_zone[0].nb_dev; i++)
-			conf->devlist[i]->raid_disk /= conf->scale_raid_disks;
-		/* FIXME update sysfs rd links */
-	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -643,7 +639,6 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 	mddev->recovery_cp = MaxSector;
 
 	create_strip_zones(mddev, &priv_conf);
-	priv_conf->scale_raid_disks = 2;
 	return priv_conf;
 }
 
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index d724e664ca4d..91f8e876ee64 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -13,9 +13,6 @@ struct raid0_private_data
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-	int scale_raid_disks; /* divide rdev->raid_disks by this in run()
-			       * to handle conversion from raid10
-			       */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6d420cb487b5..1bab3559f3e2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev)
 	if (!conf->thread)
 		goto out;
 
-	conf->scale_disks = 0;
 	conf->mddev = mddev;
 	return conf;
 
@@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev)
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
 			continue;
-		if (conf->scale_disks) {
-			disk_idx *= conf->scale_disks;
-			rdev->raid_disk = disk_idx;
-			/* MOVE 'rd%d' link !! */
-		}
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
@@ -2435,13 +2429,6 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 		return ERR_PTR(-EINVAL);
 	}
 
-	/* Update slot numbers to obtain
-	 * degraded raid10 with missing mirrors
-	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		rdev->raid_disk *= 2;
-	}
-
 	/* Set new parameters */
 	mddev->new_level = 10;
 	/* new layout: far_copies = 1, near_copies = 2 */
@@ -2454,7 +2441,11 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 	mddev->recovery_cp = MaxSector;
 
 	conf = setup_conf(mddev);
-	conf->scale_disks = 2;
+	if (!IS_ERR(conf))
+		list_for_each_entry(rdev, &mddev->disks, same_set)
+			if (rdev->raid_disk >= 0)
+				rdev->new_raid_disk = rdev->raid_disk * 2;
+		
 	return conf;
 }
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3824a087e17c..2316ac2e8e21 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -38,11 +38,6 @@ struct r10_private_data_s {
 	int chunk_shift; /* shift from chunks to sectors */
 	sector_t chunk_mask;
 
-	int			scale_disks;  /* When starting array, multiply
-					       * each ->raid_disk by this.
-					       * Need for raid0->raid10 migration
-					       */
-
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;

From f73ea87375a1b2bf6c0be82bb9a3cb9d5ee7a407 Mon Sep 17 00:00:00 2001
From: Maciej Trela <maciej.trela@intel.com>
Date: Wed, 16 Jun 2010 11:46:29 +0100
Subject: [PATCH 04/12] md: fix raid10 takeover: use new_layout for setup_conf

Use mddev->new_layout in setup_conf.
Also use new_chunk, and don't set ->degraded in takeover().  That
gets set in run()

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1bab3559f3e2..42e64e4e5e25 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev)
 	sector_t stride, size;
 	int err = -EINVAL;
 
-	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
-	    !is_power_of_2(mddev->chunk_sectors)) {
+	if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
+	    !is_power_of_2(mddev->new_chunk_sectors)) {
 		printk(KERN_ERR "md/raid10:%s: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
 		       mdname(mddev), PAGE_SIZE);
 		goto out;
 	}
 
-	nc = mddev->layout & 255;
-	fc = (mddev->layout >> 8) & 255;
-	fo = mddev->layout & (1<<16);
+	nc = mddev->new_layout & 255;
+	fc = (mddev->new_layout >> 8) & 255;
+	fo = mddev->new_layout & (1<<16);
 
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
-	    (mddev->layout >> 17)) {
+	    (mddev->new_layout >> 17)) {
 		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
-		       mdname(mddev), mddev->layout);
+		       mdname(mddev), mddev->new_layout);
 		goto out;
 	}
 
@@ -2435,7 +2435,6 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 	mddev->new_layout = (1<<8) + 2;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = mddev->raid_disks;
-	mddev->degraded = mddev->raid_disks;
 	mddev->raid_disks *= 2;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;

From 001048a318d48e93cb6a1246f3b20335b2a7c855 Mon Sep 17 00:00:00 2001
From: Maciej Trela <maciej.trela@intel.com>
Date: Wed, 16 Jun 2010 11:55:14 +0100
Subject: [PATCH 05/12] md: clear layout after ->raid0 takeover

After takeover from raid5/10 -> raid0 mddev->layout is not cleared.

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7c7c38058bc2..ac09b7d38553 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -592,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev)
 
 	/* Set new parameters */
 	mddev->new_level = 0;
+	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->raid_disks--;
 	mddev->delta_disks = -1;
@@ -631,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 
 	/* Set new parameters */
 	mddev->new_level = 0;
+	mddev->new_layout = 0;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = - mddev->raid_disks / 2;
 	mddev->raid_disks += mddev->delta_disks;

From 049d6c1ef983c9ac43aa423dfd752071a5b0002d Mon Sep 17 00:00:00 2001
From: Maciej Trela <maciej.trela@intel.com>
Date: Wed, 16 Jun 2010 11:56:12 +0100
Subject: [PATCH 06/12] md: enable raid4->raid0 takeover

Only level 5 with layout=PARITY_N can be taken over to raid0 now.
Lets allow level 4 either.

Signed-off-by: Maciej Trela <maciej.trela@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac09b7d38553..563abed5a2cb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -569,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
-static void *raid0_takeover_raid5(mddev_t *mddev)
+static void *raid0_takeover_raid45(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 	raid0_conf_t *priv_conf;
@@ -647,12 +647,16 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
 static void *raid0_takeover(mddev_t *mddev)
 {
 	/* raid0 can take over:
+	 *  raid4 - if all data disks are active.
 	 *  raid5 - providing it is Raid4 layout and one disk is faulty
 	 *  raid10 - assuming we have all necessary active disks
 	 */
+	if (mddev->level == 4)
+		return raid0_takeover_raid45(mddev);
+
 	if (mddev->level == 5) {
 		if (mddev->layout == ALGORITHM_PARITY_N)
-			return raid0_takeover_raid5(mddev);
+			return raid0_takeover_raid45(mddev);
 
 		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
 		       mdname(mddev), ALGORITHM_PARITY_N);

From e4e11e385d1e5516ac76c956d6c25e6c2fa1b8d0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 16 Jun 2010 16:45:16 +1000
Subject: [PATCH 07/12] md/raid5: avoid oops when number of devices is reduced
 then increased.

The entries in the stripe_cache maintained by raid5 are enlarged
when we increased the number of devices in the array, but not
shrunk when we reduce the number of devices.
So if entries are added after reducing the number of devices, we
much ensure to initialise the whole entry, not just the part that
is currently relevant.  Otherwise if we enlarge the array again,
we will reference uninitialised values.

As grow_buffers/shrink_buffer now want to use a count that is stored
explicity in the raid_conf, they should get it from there rather than
being passed it as a parameter.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d2c0f94fa37d..2c055dec8c68 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -277,12 +277,13 @@ out:
 	return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
 	struct page *p;
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num ; i++) {
+	for (i = 0; i < num ; i++) {
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 	}
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
 	int i;
+	int num = sh->raid_conf->pool_size;
 
-	for (i=0; i<num; i++) {
+	for (i = 0; i < num; i++) {
 		struct page *page;
 
 		if (!(page = alloc_page(GFP_KERNEL))) {
@@ -1240,19 +1242,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
-	int disks = max(conf->raid_disks, conf->previous_raid_disks);
 	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
 	if (!sh)
 		return 0;
-	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
 	sh->raid_conf = conf;
 	spin_lock_init(&sh->lock);
 	#ifdef CONFIG_MULTICORE_RAID456
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 	#endif
 
-	if (grow_buffers(sh, disks)) {
-		shrink_buffers(sh, disks);
+	if (grow_buffers(sh)) {
+		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
@@ -1468,7 +1469,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
 	if (!sh)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
-	shrink_buffers(sh, conf->pool_size);
+	shrink_buffers(sh);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	return 1;

From 70fffd0bfab1558a8c64c5e903dea1fb84cd9f6b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 16 Jun 2010 17:01:25 +1000
Subject: [PATCH 08/12] md: Don't update ->recovery_offset when reshaping an
 array to fewer devices.

When an array is reshaped to have fewer devices, the reshape proceeds
from the end of the devices to the beginning.

If a device happens to be non-In_sync (which is possible but rare)
we would normally update the ->recovery_offset as the reshape
progresses. However that would be wrong as the recover_offset records
that the early part of the device is in_sync, while in fact it would
only be the later part that is in_sync, and in any case the offset
number would be measured from the wrong end of the device.

Relatedly, if after a reshape a spare is discovered to not be
recoverred all the way to the end, not allow spare_active
to incorporate it in the array.

This becomes relevant in the following sample scenario:

A 4 drive RAID5 is converted to a 6 drive RAID6 in a combined
operation.
The RAID5->RAID6 conversion will cause a 5 drive to be included as a
spare, then the 5drive -> 6drive reshape will effectively rebuild that
spare as it progresses.  The 6th drive is treated as in_sync the whole
time as there is never any case that we might consider reading from
it, but must not because there is no valid data.

If we interrupt this reshape part-way through and reverse it to return
to a 5-drive RAID6 (or event a 4-drive RAID5), we don't want to update
the recovery_offset - as that would be wrong - and we don't want to
include that spare as active in the 5-drive RAID6 when the reversed
reshape completed and it will be mostly out-of-sync still.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 2 ++
 drivers/md/raid5.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4869128bf742..cb20d0b0555a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 	/* First make sure individual recovery_offsets are correct */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk >= 0 &&
+		    mddev->delta_disks >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    mddev->curr_resync_completed > rdev->recovery_offset)
 				rdev->recovery_offset = mddev->curr_resync_completed;
@@ -6872,6 +6873,7 @@ void md_do_sync(mddev_t *mddev)
 			rcu_read_lock();
 			list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 				if (rdev->raid_disk >= 0 &&
+				    mddev->delta_disks >= 0 &&
 				    !test_bit(Faulty, &rdev->flags) &&
 				    !test_bit(In_sync, &rdev->flags) &&
 				    rdev->recovery_offset < mddev->curr_resync)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2c055dec8c68..f972a94bbc32 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5208,6 +5208,7 @@ static int raid5_spare_active(mddev_t *mddev)
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->disks + i;
 		if (tmp->rdev
+		    && tmp->rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &tmp->rdev->flags)
 		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			unsigned long flags;

From 674806d62fb02a22eea948c9f1b5e58e0947b728 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 16 Jun 2010 17:17:53 +1000
Subject: [PATCH 09/12] md/raid5: More careful check for "has array failed".

When we are reshaping an array, the device failure combinations
that cause us to decide that the array as failed are more subtle.

In particular, any 'spare' will be fully in-sync in the section
of the array that has already been reshaped, thus failures that
affect only that section are less critical.

So encode this subtlety in a new function and call it as appropriate.

The case that showed this problem was a 4 drive RAID5 to 8 drive RAID6
conversion where the last two devices failed.
This resulted in:

  good good good good incomplete good good failed failed

while converting a 5-drive RAID6 to 8 drive RAID5
The incomplete device causes the whole array to look bad,
bad as it was actually good for the section that had been
converted to 8-drives, all the data was actually safe.

Reported-by: Terry Morris <tbmorris@tbmorris.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 75 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f972a94bbc32..d4b233c25f2e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -366,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 	return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+	int degraded;
+	int i;
+	if (conf->mddev->reshape_position == MaxSector)
+		return conf->mddev->degraded > conf->max_degraded;
+
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->previous_raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If the reshape increases the number of devices,
+			 * this is being recovered by the reshape, so
+			 * this 'previous' section is not in_sync.
+			 * If the number of devices is being reduced however,
+			 * the device can only be part of the array if
+			 * we are reverting a reshape, so this section will
+			 * be in-sync.
+			 */
+			if (conf->raid_disks >= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	rcu_read_lock();
+	degraded = 0;
+	for (i = 0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			degraded++;
+		else if (test_bit(In_sync, &rdev->flags))
+			;
+		else
+			/* not in-sync or faulty.
+			 * If reshape increases the number of devices, this
+			 * section has already been recovered, else it
+			 * almost certainly hasn't.
+			 */
+			if (conf->raid_disks <= conf->previous_raid_disks)
+				degraded++;
+	}
+	rcu_read_unlock();
+	if (degraded > conf->max_degraded)
+		return 1;
+	return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
@@ -5006,7 +5073,7 @@ static int run(mddev_t *mddev)
 	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
 			   - working_disks);
 
-	if (mddev->degraded > conf->max_degraded) {
+	if (has_failed(conf)) {
 		printk(KERN_ERR "md/raid:%s: not enough operational devices"
 			" (%d/%d failed)\n",
 			mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5244,7 +5311,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
 		 * isn't possible.
 		 */
 		if (!test_bit(Faulty, &rdev->flags) &&
-		    mddev->degraded <= conf->max_degraded &&
+		    !has_failed(conf) &&
 		    number < conf->raid_disks) {
 			err = -EBUSY;
 			goto abort;
@@ -5272,7 +5339,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
 
@@ -5364,7 +5431,7 @@ static int check_reshape(mddev_t *mddev)
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
-	if (mddev->degraded > conf->max_degraded)
+	if (has_failed(conf))
 		return -EINVAL;
 	if (mddev->delta_disks < 0) {
 		/* We might be able to shrink, but the devices must

From 415e72d034c50520ddb7ff79e7d1792c1306f0c9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 17 Jun 2010 17:25:21 +1000
Subject: [PATCH 10/12] md/raid5: Allow recovered part of partially recovered
 devices to be in-sync

During a recovery of reshape the early part of some devices might be
in-sync while the later parts are not.
We we know we are looking at an early part it is good to treat that
part as in-sync for stripe calculations.

This is particularly important for a reshape which suffers device
failure.  Treating the data as in-sync can mean the difference between
data-safety and data-loss.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d4b233c25f2e..09f07dadf404 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3031,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
 		mdk_rdev_t *rdev;
 
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
 			"written %p\n",	i, dev->flags, dev->toread, dev->read,
@@ -3068,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* could be in-sync depending on recovery/reshape status */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			s.failed++;
 			s.failed_num = i;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 
@@ -3312,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
-		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3350,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
 			blocked_rdev = rdev;
 			atomic_inc(&rdev->nr_pending);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+		clear_bit(R5_Insync, &dev->flags);
+		if (!rdev)
+			/* Not in-sync */;
+		else if (test_bit(In_sync, &rdev->flags))
+			set_bit(R5_Insync, &dev->flags);
+		else {
+			/* in sync if before recovery_offset */
+			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+				set_bit(R5_Insync, &dev->flags);
+		}
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
-		if (!rdev || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(R5_ReadError, &dev->flags)) {
+		if (test_bit(R5_ReadError, &dev->flags))
+			clear_bit(R5_Insync, &dev->flags);
+		if (!test_bit(R5_Insync, &dev->flags)) {
 			if (s.failed < 2)
 				r6s.failed_num[s.failed] = i;
 			s.failed++;
-		} else
-			set_bit(R5_Insync, &dev->flags);
+		}
 	}
 	rcu_read_unlock();
 

From 2f115882499f3e5eca33d1df07b8876cc752a1ff Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 17 Jun 2010 17:41:03 +1000
Subject: [PATCH 11/12] md/raid5: add a missing 'continue' in a loop.

As the comment says, the tail of this loop only applies to devices
that are not fully in sync, so if In_sync was set, we should avoid
the rest of the loop.

This bug will hardly ever cause an actual problem.  The worst it
can do is allow an array to be assembled that is dirty and degraded,
which is not generally a good idea (without warning the sysadmin
first).

This will only happen if the array is RAID4 or a RAID5/6 in an
intermediate state during a reshape and so has one drive that is
all 'parity' - no data - while some other device has failed.

This is certainly possible, but not at all common.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 09f07dadf404..66cd47973398 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5057,8 +5057,10 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk < 0)
 			continue;
-		if (test_bit(In_sync, &rdev->flags))
+		if (test_bit(In_sync, &rdev->flags)) {
 			working_disks++;
+			continue;
+		}
 		/* This disc is not fully in-sync.  However if it
 		 * just stored parity (beyond the recovery_offset),
 		 * when we don't need to be concerned about the

From 3424bf6a772cff606fc4bc24a3639c937afb547f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 17 Jun 2010 17:48:26 +1000
Subject: [PATCH 12/12] md/raid5: don't include 'spare' drives when reshaping
 to fewer devices.

There are few situations where it would make any sense to add a spare
when reducing the number of devices in an array, but it is
conceivable:  A 6 drive RAID6 with two missing devices could be
reshaped to a 5 drive RAID6, and a spare could become available
just in time for the reshape, but not early enough to have been
recovered first.  'freezing' recovery can make this easy to
do without any races.

However doing such a thing is a bad idea.  md will not record the
partially-recovered state of the 'spare' and when the reshape
finished it will think that the spare is still spare.
Easiest way to avoid this confusion is to simply disallow it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 66cd47973398..96c690279fc6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5526,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev)
 
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
+	 * Don't add devices if we are reducing the number of
+	 * devices in the array.  This is because it is not possible
+	 * to correctly record the "partially reconstructed" state of
+	 * such devices during the reshape and confusion could result.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	if (mddev->delta_disks >= 0)
+	    list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			if (raid5_add_disk(mddev, rdev) == 0) {
@@ -5549,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 		}
 
 	/* When a reshape changes the number of devices, ->degraded
-	 * is measured against the large of the pre and post number of
+	 * is measured against the larger of the pre and post number of
 	 * devices.*/
 	if (mddev->delta_disks > 0) {
 		spin_lock_irqsave(&conf->device_lock, flags);