From 13864515f7bf6cabd60e63c62e09d311386ae1f1 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@novell.com>
Date: Sat, 28 Jun 2008 08:31:19 +1000
Subject: [PATCH 01/52] linear: correct disk numbering error check

From: "Nikanth Karthikesan" <knikanth@novell.com>

Correct disk numbering problem check.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/linear.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 10748240cb2f..ec921f58fbb8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -126,7 +126,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
 
-		if (j < 0 || j > raid_disks || disk->rdev) {
+		if (j < 0 || j >= raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
 			goto out;
 		}

From 0e13fe23a00ad88c737d91d94a050707c6139ce4 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:20 +1000
Subject: [PATCH 02/52] use bio_endio instead of a call to bi_end_io

Turn calls to bi->bi_end_io() into bio_endio(). Apparently bio_endio does
exactly the same error processing as is hardcoded at these places.

bio_endio() avoids recursion (or will soon), so it should be used.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 54c8ee28fcc4..214b44122822 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -115,9 +115,7 @@ static void return_io(struct bio *return_bi)
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_size = 0;
-		bi->bi_end_io(bi,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
+		bio_endio(bi, 0);
 		bi = return_bi;
 	}
 }
@@ -3700,9 +3698,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		if ( rw == WRITE )
 			md_write_end(mddev);
 
-		bi->bi_end_io(bi,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
+		bio_endio(bi, 0);
 	}
 	return 0;
 }
@@ -4005,12 +4001,8 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	spin_lock_irq(&conf->device_lock);
 	remaining = --raid_bio->bi_phys_segments;
 	spin_unlock_irq(&conf->device_lock);
-	if (remaining == 0) {
-
-		raid_bio->bi_end_io(raid_bio,
-			      test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
-			        ? 0 : -EIO);
-	}
+	if (remaining == 0)
+		bio_endio(raid_bio, 0);
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_stripe);
 	return handled;

From a0da84f35b25875870270d16b6eccda4884d61a7 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:22 +1000
Subject: [PATCH 03/52] Improve setting of "events_cleared" for write-intent
 bitmaps.

When an array is degraded, bits in the write-intent bitmap are not
cleared, so that if the missing device is re-added, it can be synced
by only updated those parts of the device that have changed since
it was removed.

The enable this a 'events_cleared' value is stored. It is the event
counter for the array the last time that any bits were cleared.

Sometimes - if a device disappears from an array while it is 'clean' -
the events_cleared value gets updated incorrectly (there are subtle
ordering issues between updateing events in the main metadata and the
bitmap metadata) resulting in the missing device appearing to require
a full resync when it is re-added.

With this patch, we update events_cleared precisely when we are about
to clear a bit in the bitmap.  We record events_cleared when we clear
the bit internally, and copy that to the superblock which is written
out before the bit on storage.  This makes it more "obviously correct".

We also need to update events_cleared when the event_count is going
backwards (as happens on a dirty->clean transition of a non-degraded
array).

Thanks to Mike Snitzer for identifying this problem and testing early
"fixes".

Cc:  "Mike Snitzer" <snitzer@gmail.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/bitmap.c         | 29 ++++++++++++++++++++++++-----
 include/linux/raid/bitmap.h |  1 +
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b26927ce889c..dedba16d42f7 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -454,8 +454,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (!bitmap->mddev->degraded)
-		sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+	if (bitmap->mddev->events < bitmap->events_cleared) {
+		/* rocking back to read-only */
+		bitmap->events_cleared = bitmap->mddev->events;
+		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	}
 	kunmap_atomic(sb, KM_USER0);
 	write_page(bitmap, bitmap->sb_page, 1);
 }
@@ -1085,9 +1088,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 			} else
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 			lastpage = page;
-/*
-			printk("bitmap clean at page %lu\n", j);
-*/
+
+			/* We are possibly going to clear some bits, so make
+			 * sure that events_cleared is up-to-date.
+			 */
+			if (bitmap->need_sync) {
+				bitmap_super_t *sb;
+				bitmap->need_sync = 0;
+				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+				sb->events_cleared =
+					cpu_to_le64(bitmap->events_cleared);
+				kunmap_atomic(sb, KM_USER0);
+				write_page(bitmap, bitmap->sb_page, 1);
+			}
 			spin_lock_irqsave(&bitmap->lock, flags);
 			clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
@@ -1257,6 +1270,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 			return;
 		}
 
+		if (success &&
+		    bitmap->events_cleared < bitmap->mddev->events) {
+			bitmap->events_cleared = bitmap->mddev->events;
+			bitmap->need_sync = 1;
+		}
+
 		if (!success && ! (*bmc & NEEDED_MASK))
 			*bmc |= NEEDED_MASK;
 
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 78bfdea24a8e..e98900671ca9 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -221,6 +221,7 @@ struct bitmap {
 	unsigned long syncchunk;
 
 	__u64	events_cleared;
+	int need_sync;
 
 	/* bitmap spinlock */
 	spinlock_t lock;

From 5e96ee65c8bd629ce093da67a066d3946468298a Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:24 +1000
Subject: [PATCH 04/52] Allow setting start point for requested check/repair

This makes it possible to just resync a small part of an array.
e.g. if a drive reports that it has questionable sectors,
a 'repair' of just the region covering those sectors will
cause them to be read and, if there is an error, re-written
with correct data.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c           | 47 ++++++++++++++++++++++++++++++++++-----
 include/linux/raid/md_k.h |  2 ++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..261322722c19 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -278,6 +278,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
+	new->resync_min = 0;
 	new->resync_max = MaxSector;
 	new->level = LEVEL_NONE;
 
@@ -3074,6 +3075,36 @@ sync_completed_show(mddev_t *mddev, char *page)
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
+static ssize_t
+min_sync_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	unsigned long long min;
+	if (strict_strtoull(buf, 10, &min))
+		return -EINVAL;
+	if (min > mddev->resync_max)
+		return -EINVAL;
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	/* Must be a multiple of chunk_size */
+	if (mddev->chunk_size) {
+		if (min & (sector_t)((mddev->chunk_size>>9)-1))
+			return -EINVAL;
+	}
+	mddev->resync_min = min;
+
+	return len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
 static ssize_t
 max_sync_show(mddev_t *mddev, char *page)
 {
@@ -3089,9 +3120,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 	if (strncmp(buf, "max", 3) == 0)
 		mddev->resync_max = MaxSector;
 	else {
-		char *ep;
-		unsigned long long max = simple_strtoull(buf, &ep, 10);
-		if (ep == buf || (*ep != 0 && *ep != '\n'))
+		unsigned long long max;
+		if (strict_strtoull(buf, 10, &max))
+			return -EINVAL;
+		if (max < mddev->resync_min)
 			return -EINVAL;
 		if (max < mddev->resync_max &&
 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3222,6 +3254,7 @@ static struct attribute *md_redundancy_attrs[] = {
 	&md_sync_speed.attr,
 	&md_sync_force_parallel.attr,
 	&md_sync_completed.attr,
+	&md_min_sync.attr,
 	&md_max_sync.attr,
 	&md_suspend_lo.attr,
 	&md_suspend_hi.attr,
@@ -3777,6 +3810,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 		mddev->size = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
+		mddev->resync_min = 0;
 		mddev->resync_max = MaxSector;
 		mddev->reshape_position = MaxSector;
 		mddev->external = 0;
@@ -5625,9 +5659,11 @@ void md_do_sync(mddev_t *mddev)
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
 		/* we don't use the checkpoint if there's a bitmap */
-		if (!mddev->bitmap &&
-		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->resync_min;
+		else if (!mddev->bitmap)
 			j = mddev->recovery_cp;
+
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->size << 1;
 	else {
@@ -5796,6 +5832,7 @@ void md_do_sync(mddev_t *mddev)
 
  skip:
 	mddev->curr_resync = 0;
+	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 3dea9f545c8f..780e0613e6d5 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -227,6 +227,8 @@ struct mddev_s
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
 	sector_t			recovery_cp;
+	sector_t			resync_min;	/* user requested sync
+							 * starts here */
 	sector_t			resync_max;	/* resync should pause
 							 * when it gets here */
 

From f48ed538386cb41559282d989354e8f5d442d71c Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:26 +1000
Subject: [PATCH 05/52] Close race in md_probe

There is a possible race in md_probe.  If two threads call md_probe
for the same device, then one could exit (having checked that
->gendisk exists) before the other has called kobject_init_and_add,
thus returning an incomplete kobj which will cause problems when
we try to add children to it.

So extend the range of protection of disks_mutex slightly to
avoid this possibility.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 261322722c19..97852099defd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3359,9 +3359,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	disk->queue = mddev->queue;
 	add_disk(disk);
 	mddev->gendisk = disk;
-	mutex_unlock(&disks_mutex);
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
 				     "%s", "md");
+	mutex_unlock(&disks_mutex);
 	if (error)
 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
 		       disk->disk_name);

From 1a0fd497733bd029a7d5f2e5c69b1dff715b7792 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:27 +1000
Subject: [PATCH 06/52] Don't try to make md arrays dirty if that is not
 meaningful.

Arrays personalities such as 'raid0' and 'linear' have no redundancy,
and so marking them as 'clean' or 'dirty' is not meaningful.
So always allow write requests without requiring a superblock update.

Such arrays types are detected by ->sync_request being NULL.  If it is
not possible to send a sync request we don't need a 'dirty' flag because
all a dirty flag does is trigger some sync_requests.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 97852099defd..9e3ce432e37e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5536,6 +5536,8 @@ void md_allow_write(mddev_t *mddev)
 		return;
 	if (mddev->ro)
 		return;
+	if (!mddev->pers->sync_request)
+		return;
 
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {

From 8ed0a5216a0238f53b482ec88ce4aeed4b9f0da1 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:29 +1000
Subject: [PATCH 07/52] Enable setting of 'offset' and 'size' of a hot-added
 spare.

offset_store and rdev_size_store allow control of the region of a
device which is to be using in an md/raid array.
They only allow these values to be set when an array is being assembled,
as changing them on an active array could be dangerous.
However when adding a spare device to an array, we might need to
set the offset and size before starting recovery.  So allow
these values to be set also if "->raid_disk < 0" which indicates that
the device is still a spare.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9e3ce432e37e..3b5cd4ef54f1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1984,7 +1984,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	unsigned long long offset = simple_strtoull(buf, &e, 10);
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers)
+	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
 	if (rdev->size && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
@@ -2023,7 +2023,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (my_mddev->pers)
+	if (my_mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
 	rdev->size = size;
 	if (size > oldsize && rdev->mddev->external) {

From 6c2fce2ef6b4821c21b5c42c7207cb9cf8c87eda Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:31 +1000
Subject: [PATCH 08/52] Support adding a spare to a live md array with external
 metadata.

i.e. extend the 'md/dev-XXX/slot' attribute so that you can
tell a device to fill an vacant slot in an and md array.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c        | 42 +++++++++++++++++++++++++++++++++++++++---
 drivers/md/multipath.c |  7 ++++++-
 drivers/md/raid1.c     |  7 ++++++-
 drivers/md/raid10.c    | 10 ++++++++--
 drivers/md/raid5.c     | 10 ++++++++--
 5 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3b5cd4ef54f1..5d6fac1fd39e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		slot = -1;
 	else if (e==buf || (*e && *e!= '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers) {
+	if (rdev->mddev->pers && slot == -1) {
 		/* Setting 'slot' on an active array requires also
 		 * updating the 'rd%d' link, and communicating
 		 * with the personality with ->hot_*_disk.
@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		 * failed/spare devices.  This normally happens automatically,
 		 * but not when the metadata is externally managed.
 		 */
-		if (slot != -1)
-			return -EBUSY;
 		if (rdev->raid_disk == -1)
 			return -EEXIST;
 		/* personality does all needed checks */
@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		sysfs_remove_link(&rdev->mddev->kobj, nm);
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
+	} else if (rdev->mddev->pers) {
+		mdk_rdev_t *rdev2;
+		struct list_head *tmp;
+		/* Activating a spare .. or possibly reactivating
+		 * if we every get bitmaps working here.
+		 */
+
+		if (rdev->raid_disk != -1)
+			return -EBUSY;
+
+		if (rdev->mddev->pers->hot_add_disk == NULL)
+			return -EINVAL;
+
+		rdev_for_each(rdev2, tmp, rdev->mddev)
+			if (rdev2->raid_disk == slot)
+				return -EEXIST;
+
+		rdev->raid_disk = slot;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = slot;
+		else
+			rdev->saved_raid_disk = -1;
+		err = rdev->mddev->pers->
+			hot_add_disk(rdev->mddev, rdev);
+		if (err != 1) {
+			rdev->raid_disk = -1;
+			if (err == 0)
+				return -EEXIST;
+			return err;
+		}
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+			printk(KERN_WARNING
+			       "md: cannot register "
+			       "%s for %s\n",
+			       nm, mdname(rdev->mddev));
+
+		/* don't wakeup anyone, leave that to userspace. */
 	} else {
 		if (slot >= rdev->mddev->raid_disks)
 			return -ENOSPC;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116e0de9..4a1d714c048e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int found = 0;
 	int path;
 	struct multipath_info *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
+
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
 
 	print_multipath_conf(conf);
 
-	for (path=0; path<mddev->raid_disks; path++) 
+	for (path = first; path <= last; path++)
 		if ((p=conf->multipaths+path)->rdev == NULL) {
 			q = rdev->bdev->bd_disk->queue;
 			blk_queue_stack_limits(mddev->queue, q);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b947218a..d32fc559ff05 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int found = 0;
 	int mirror = 0;
 	mirror_info_t *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
+	for (mirror = first; mirror <= last; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
 			blk_queue_stack_limits(mddev->queue,
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a71277b640ab..50ad8d2ae0e8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int found = 0;
 	int mirror;
 	mirror_info_t *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!enough(conf))
 		return 0;
 
+	if (rdev->raid_disk)
+		first = last = rdev->raid_disk;
+
 	if (rdev->saved_raid_disk >= 0 &&
+	    rdev->saved_raid_disk >= first &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
 	else
-		mirror = 0;
-	for ( ; mirror < mddev->raid_disks; mirror++)
+		mirror = first;
+	for ( ; mirror <= last ; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
 			blk_queue_stack_limits(mddev->queue,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 214b44122822..002f33b1ae00 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4607,21 +4607,27 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	int found = 0;
 	int disk;
 	struct disk_info *p;
+	int first = 0;
+	int last = conf->raid_disks - 1;
 
 	if (mddev->degraded > conf->max_degraded)
 		/* no point adding a device */
 		return 0;
 
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
 	/*
 	 * find the disk ... but prefer rdev->saved_raid_disk
 	 * if possible.
 	 */
 	if (rdev->saved_raid_disk >= 0 &&
+	    rdev->saved_raid_disk >= first &&
 	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
 		disk = rdev->saved_raid_disk;
 	else
-		disk = 0;
-	for ( ; disk < conf->raid_disks; disk++)
+		disk = first;
+	for ( ; disk <= last ; disk++)
 		if ((p=conf->disks + disk)->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;

From 199050ea1ff2270174ee525b73bc4c3323098897 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:33 +1000
Subject: [PATCH 09/52] rationalise return value for ->hot_add_disk method.

For all array types but linear, ->hot_add_disk returns 1 on
success, 0 on failure.
For linear, it returns 0 on success and -errno on failure.

This doesn't cause a functional problem because the ->hot_add_disk
function of linear is used quite differently to the others.
However it is confusing.

So convert all to return 0 for success or -errno on failure
and fix call sites to match.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c        |  7 +++----
 drivers/md/multipath.c |  8 +++++---
 drivers/md/raid1.c     |  6 +++---
 drivers/md/raid10.c    | 10 +++++-----
 drivers/md/raid5.c     | 10 +++++-----
 5 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5d6fac1fd39e..45e255d4916f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1977,10 +1977,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			rdev->saved_raid_disk = -1;
 		err = rdev->mddev->pers->
 			hot_add_disk(rdev->mddev, rdev);
-		if (err != 1) {
+		if (err) {
 			rdev->raid_disk = -1;
-			if (err == 0)
-				return -EEXIST;
 			return err;
 		}
 		sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5920,7 +5918,8 @@ static int remove_and_add_spares(mddev_t *mddev)
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
 				rdev->recovery_offset = 0;
-				if (mddev->pers->hot_add_disk(mddev,rdev)) {
+				if (mddev->pers->
+				    hot_add_disk(mddev, rdev) == 0) {
 					char nm[20];
 					sprintf(nm, "rd%d", rdev->raid_disk);
 					if (sysfs_create_link(&mddev->kobj,
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4a1d714c048e..541cbe3414bd 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,7 +281,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	multipath_conf_t *conf = mddev->private;
 	struct request_queue *q;
-	int found = 0;
+	int err = -EEXIST;
 	int path;
 	struct multipath_info *p;
 	int first = 0;
@@ -312,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			rdev->raid_disk = path;
 			set_bit(In_sync, &rdev->flags);
 			rcu_assign_pointer(p->rdev, rdev);
-			found = 1;
+			err = 0;
+			break;
 		}
 
 	print_multipath_conf(conf);
-	return found;
+
+	return err;
 }
 
 static int multipath_remove_disk(mddev_t *mddev, int number)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d32fc559ff05..f05d5983efb6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1100,7 +1100,7 @@ static int raid1_spare_active(mddev_t *mddev)
 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int mirror = 0;
 	mirror_info_t *p;
 	int first = 0;
@@ -1124,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
-			found = 1;
+			err = 0;
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
 			 */
@@ -1135,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		}
 
 	print_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid1_remove_disk(mddev_t *mddev, int number)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 50ad8d2ae0e8..df08a9fa3a1f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1113,7 +1113,7 @@ static int raid10_spare_active(mddev_t *mddev)
 static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int mirror;
 	mirror_info_t *p;
 	int first = 0;
@@ -1123,9 +1123,9 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		/* only hot-add to in-sync arrays, as recovery is
 		 * very different from resync
 		 */
-		return 0;
+		return -EBUSY;
 	if (!enough(conf))
-		return 0;
+		return -EINVAL;
 
 	if (rdev->raid_disk)
 		first = last = rdev->raid_disk;
@@ -1151,7 +1151,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
-			found = 1;
+			err = 0;
 			if (rdev->saved_raid_disk != mirror)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
@@ -1159,7 +1159,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		}
 
 	print_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid10_remove_disk(mddev_t *mddev, int number)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 002f33b1ae00..8c4e6149daea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4604,7 +4604,7 @@ abort:
 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	raid5_conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int disk;
 	struct disk_info *p;
 	int first = 0;
@@ -4612,7 +4612,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (mddev->degraded > conf->max_degraded)
 		/* no point adding a device */
-		return 0;
+		return -EINVAL;
 
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
@@ -4631,14 +4631,14 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		if ((p=conf->disks + disk)->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
-			found = 1;
+			err = 0;
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
 	print_raid5_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4739,7 +4739,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	rdev_for_each(rdev, rtmp, mddev)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
-			if (raid5_add_disk(mddev, rdev)) {
+			if (raid5_add_disk(mddev, rdev) == 0) {
 				char nm[20];
 				set_bit(In_sync, &rdev->flags);
 				added_devices++;

From c7d0c941ae7f82940a13f785be70dc3097d96687 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:34 +1000
Subject: [PATCH 10/52] Don't reject HOT_REMOVE_DISK request for an array that
 is not yet started.

There is really no need for this test here, and there are valid
cases for selectively removing devices from an array that
it not actually active.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 45e255d4916f..1442761ac98e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4300,9 +4300,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
 	char b[BDEVNAME_SIZE];
 	mdk_rdev_t *rdev;
 
-	if (!mddev->pers)
-		return -ENODEV;
-
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;

From 0fd62b861eac7d2dea9b7e939953b20f37186ea1 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:36 +1000
Subject: [PATCH 11/52] Make sure all changes to md/array_state are notified.

Changes in md/array_state could be of interest to a monitoring
program.  So make sure all changes trigger a notification.

Exceptions:
   changing active_idle to active is not reported because it
      is frequent and not interesting.
   changing active to active_idle is only reported on arrays
      with externally managed metadata, as it is not interesting
      otherwise.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 Documentation/md.txt |  5 +++++
 drivers/md/md.c      | 29 ++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index a8b430627473..dca97ba4944a 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,11 @@ All md devices contain:
      writing the word for the desired state, however some states
      cannot be explicitly set, and some transitions are not allowed.
 
+     Select/poll works on this file.  All changes except between
+     	active_idle and active (which can be frequent and are not
+	very interesting) are notified.  active->active_idle is
+	reported if the metadata is externally managed.
+
      clear
          No devices, no size, no level
          Writing is equivalent to STOP_ARRAY ioctl
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1442761ac98e..5b9d4fe4e6e4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2716,8 +2716,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 	if (err)
 		return err;
-	else
+	else {
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 		return len;
+	}
 }
 static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -3408,7 +3410,11 @@ static void md_safemode_timeout(unsigned long data)
 {
 	mddev_t *mddev = (mddev_t *) data;
 
-	mddev->safemode = 1;
+	if (!atomic_read(&mddev->writes_pending)) {
+		mddev->safemode = 1;
+		if (mddev->external)
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+	}
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -3675,6 +3681,7 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->changed = 1;
 	md_new_event(mddev);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
@@ -3709,6 +3716,8 @@ static int restart_array(mddev_t *mddev)
 		md_wakeup_thread(mddev->thread);
 		md_wakeup_thread(mddev->sync_thread);
 		err = 0;
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
+
 	} else
 		err = -EINVAL;
 
@@ -3879,6 +3888,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 			mdname(mddev));
 	err = 0;
 	md_new_event(mddev);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
 out:
 	return err;
 }
@@ -4876,8 +4886,9 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	    mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-		md_wakeup_thread(mddev->thread);
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
 
 		} else {
 			err = -EROFS;
@@ -5516,6 +5527,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
  */
 void md_write_start(mddev_t *mddev, struct bio *bi)
 {
+	int did_change = 0;
 	if (bio_data_dir(bi) != WRITE)
 		return;
 
@@ -5526,6 +5538,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		md_wakeup_thread(mddev->sync_thread);
+		did_change = 1;
 	}
 	atomic_inc(&mddev->writes_pending);
 	if (mddev->safemode == 1)
@@ -5536,10 +5549,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 			mddev->in_sync = 0;
 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 			md_wakeup_thread(mddev->thread);
+			did_change = 1;
 		}
 		spin_unlock_irq(&mddev->write_lock);
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
 	}
+	if (did_change)
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 	wait_event(mddev->sb_wait,
 		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -5991,18 +6006,22 @@ void md_check_recovery(mddev_t *mddev)
 		int spares = 0;
 
 		if (!mddev->external) {
+			int did_change = 0;
 			spin_lock_irq(&mddev->write_lock);
 			if (mddev->safemode &&
 			    !atomic_read(&mddev->writes_pending) &&
 			    !mddev->in_sync &&
 			    mddev->recovery_cp == MaxSector) {
 				mddev->in_sync = 1;
+				did_change = 1;
 				if (mddev->persistent)
 					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 			}
 			if (mddev->safemode == 1)
 				mddev->safemode = 0;
 			spin_unlock_irq(&mddev->write_lock);
+			if (did_change)
+				sysfs_notify(&mddev->kobj, NULL, "array_state");
 		}
 
 		if (mddev->flags)

From 72a23c211e4587859d5bf61ac4962d76e593fb02 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:41 +1000
Subject: [PATCH 12/52] Make sure all changes to md/sync_action are notified.

When the 'resync' thread starts or stops, when we explicitly
set sync_action, or when we determine that there is definitely nothing
to do, we notify sync_action.

To stop "sync_action" from occasionally showing the wrong value,
we introduce a new flags - MD_RECOVERY_RECOVER - to say that a
recovery is probably needed or happening, and we make sure
that we set MD_RECOVERY_RUNNING before clearing MD_RECOVERY_NEEDED.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 Documentation/md.txt      |  6 ++++++
 drivers/md/md.c           | 34 ++++++++++++++++++++++++++++------
 include/linux/raid/md_k.h |  2 ++
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index dca97ba4944a..c05bfb55659e 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -386,6 +386,12 @@ also have
 	'check' and 'repair' will start the appropriate process
            providing the current state is 'idle'.
 
+      This file responds to select/poll.  Any important change in the value
+      triggers a poll event.  Sometimes the value will briefly be
+      "recover" if a recovery seems to be needed, but cannot be
+      achieved. In that case, the transition to "recover" isn't
+      notified, but the transition away is.
+
    mismatch_count
       When performing 'check' and 'repair', and possibly when
       performing 'resync', md will count the number of errors that are
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5b9d4fe4e6e4..c26dcad8a3ac 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
-	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
@@ -2936,7 +2935,7 @@ action_show(mddev_t *mddev, char *page)
 				type = "check";
 			else
 				type = "repair";
-		} else
+		} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
 			type = "recover";
 	}
 	return sprintf(page, "%s\n", type);
@@ -2958,9 +2957,12 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+	else if (cmd_match(page, "resync"))
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (cmd_match(page, "reshape")) {
+	else if (cmd_match(page, "recover")) {
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
@@ -2977,6 +2979,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	return len;
 }
 
@@ -3682,6 +3685,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->changed = 1;
 	md_new_event(mddev);
 	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
@@ -4252,6 +4256,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			export_rdev(rdev);
 
 		md_update_sb(mddev, 1);
+		if (mddev->degraded)
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		return err;
@@ -5105,6 +5111,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
+	if (mddev->degraded)
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6055,13 +6063,18 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 			goto unlock;
 		}
+		/* Set RUNNING before clearing NEEDED to avoid
+		 * any transients in the value of "sync_action".
+		 */
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		/* Clear some bits that don't mean anything, but
 		 * might be left set
 		 */
-		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
@@ -6079,17 +6092,19 @@ void md_check_recovery(mddev_t *mddev)
 				/* Cannot proceed */
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if ((spares = remove_and_add_spares(mddev))) {
 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (mddev->recovery_cp < MaxSector) {
 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 			/* nothing to be done ... */
 			goto unlock;
 
 		if (mddev->pers->sync_request) {
-			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
@@ -6108,9 +6123,16 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->recovery = 0;
 			} else
 				md_wakeup_thread(mddev->sync_thread);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 		}
 	unlock:
+		if (!mddev->sync_thread) {
+			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+					       &mddev->recovery))
+				sysfs_notify(&mddev->kobj, NULL, "sync_action");
+		}
 		mddev_unlock(mddev);
 	}
 }
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 780e0613e6d5..62aa9c9a6ddc 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -188,6 +188,7 @@ struct mddev_s
 	 * NEEDED:   we might need to start a resync/recover
 	 * RUNNING:  a thread is running, or about to be started
 	 * SYNC:     actually doing a resync, not a recovery
+	 * RECOVER:  doing recovery, or need to try it.
 	 * INTR:     resync needs to be aborted for some reason
 	 * DONE:     thread is done and is waiting to be reaped
 	 * REQUEST:  user-space has requested a sync (used with SYNC)
@@ -198,6 +199,7 @@ struct mddev_s
 	 */
 #define	MD_RECOVERY_RUNNING	0
 #define	MD_RECOVERY_SYNC	1
+#define	MD_RECOVERY_RECOVER	2
 #define	MD_RECOVERY_INTR	3
 #define	MD_RECOVERY_DONE	4
 #define	MD_RECOVERY_NEEDED	5

From a99ac97113d5bc25ddc4d17f404c2024ac6c57f9 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:43 +1000
Subject: [PATCH 13/52] Make sure all changes to md/degraded are notified.

When a device fails, when a spare is activated, when
an array is reshaped, or when an array is started,
the extent to which the array is degraded can change.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 Documentation/md.txt | 7 +++++++
 drivers/md/md.c      | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index c05bfb55659e..eb6e69e3732e 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -392,6 +392,13 @@ also have
       achieved. In that case, the transition to "recover" isn't
       notified, but the transition away is.
 
+   degraded
+      This contains a count of the number of devices by which the
+      arrays is degraded.  So an optimal array with show '0'.  A
+      single failed/missing drive will show '1', etc.
+      This file responds to select/poll, any increase or decrease
+      in the count of missing devices will trigger an event.
+
    mismatch_count
       When performing 'check' and 'repair', and possibly when
       performing 'resync', md will count the number of errors that are
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c26dcad8a3ac..60d4cad88c20 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2969,6 +2969,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 		err = mddev->pers->start_reshape(mddev);
 		if (err)
 			return err;
+		sysfs_notify(&mddev->kobj, NULL, "degraded");
 	} else {
 		if (cmd_match(page, "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -3686,6 +3687,7 @@ static int do_md_run(mddev_t * mddev)
 	md_new_event(mddev);
 	sysfs_notify(&mddev->kobj, NULL, "array_state");
 	sysfs_notify(&mddev->kobj, NULL, "sync_action");
+	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
@@ -6049,7 +6051,9 @@ void md_check_recovery(mddev_t *mddev)
 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				/* success...*/
 				/* activate any spares */
-				mddev->pers->spare_active(mddev);
+				if (mddev->pers->spare_active(mddev))
+					sysfs_notify(&mddev->kobj, NULL,
+						     "degraded");
 			}
 			md_update_sb(mddev, 1);
 

From 526647320e696f434647f38421a6ecf65b859c43 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:44 +1000
Subject: [PATCH 14/52] Make sure all changes to md/dev-XX/state are notified

The important state change happens during an interrupt
in md_error.  So just set a flag there and call sysfs_notify
later in process context.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 Documentation/md.txt      | 10 ++++++++++
 drivers/md/md.c           | 14 +++++++++++++-
 include/linux/raid/md_k.h |  3 +++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index eb6e69e3732e..e06cc59437e4 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -297,6 +297,10 @@ Each directory contains:
 	      writemostly - device will only be subject to read
 		         requests if there are no other options.
 			 This applies only to raid1 arrays.
+	      blocked  - device has failed, metadata is "external",
+	                 and the failure hasn't been acknowledged yet.
+			 Writes that would write to this device if
+			 it were not faulty are blocked.
 	      spare    - device is working, but not a full member.
 			 This includes spares that are in the process
 			 of being recovered to
@@ -306,6 +310,12 @@ Each directory contains:
 	Writing "remove" removes the device from the array.
 	Writing "writemostly" sets the writemostly flag.
 	Writing "-writemostly" clears the writemostly flag.
+	Writing "blocked" sets the "blocked" flag.
+	Writing "-blocked" clear the "blocked" flag and allows writes
+		to complete.
+
+	This file responds to select/poll. Any change to 'faulty'
+	or 'blocked' causes an event.
 
       errors
 	An approximate count of read errors that have been detected on
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 60d4cad88c20..dc99d95a1b6d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1886,6 +1886,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		err = 0;
 	}
+	if (!err)
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -1979,7 +1981,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (err) {
 			rdev->raid_disk = -1;
 			return err;
-		}
+		} else
+			sysfs_notify(&rdev->kobj, NULL, "state");
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
 			printk(KERN_WARNING
@@ -1996,6 +1999,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 	return len;
 }
@@ -3525,6 +3529,7 @@ static int do_md_run(mddev_t * mddev)
 				return -EINVAL;
 			}
 		}
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 
 	md_probe(mddev->unit, NULL, NULL);
@@ -4256,6 +4261,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		}
 		if (err)
 			export_rdev(rdev);
+		else
+			sysfs_notify(&rdev->kobj, NULL, "state");
 
 		md_update_sb(mddev, 1);
 		if (mddev->degraded)
@@ -5115,6 +5122,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	mddev->pers->error_handler(mddev,rdev);
 	if (mddev->degraded)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	set_bit(StateChanged, &rdev->flags);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6037,6 +6045,10 @@ void md_check_recovery(mddev_t *mddev)
 		if (mddev->flags)
 			md_update_sb(mddev, 0);
 
+		rdev_for_each(rdev, rtmp, mddev)
+			if (test_and_clear_bit(StateChanged, &rdev->flags))
+				sysfs_notify(&rdev->kobj, NULL, "state");
+
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 62aa9c9a6ddc..df30c4395875 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -87,6 +87,9 @@ struct mdk_rdev_s
 #define Blocked		8		/* An error occured on an externally
 					 * managed array, don't allow writes
 					 * until it is cleared */
+#define StateChanged	9		/* Faulty or Blocked has changed during
+					 * interrupt, so it needs to be
+					 * notified by the thread */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */

From 0cd17fec983b6bca505eecee1af33138687220b6 Mon Sep 17 00:00:00 2001
From: Chris Webb <chris@arachsys.com>
Date: Sat, 28 Jun 2008 08:31:46 +1000
Subject: [PATCH 15/52] Support changing rdev size on running arrays.

From: Chris Webb <chris@arachsys.com>

Allow /sys/block/mdX/md/rdY/size to change on running arrays, moving the
superblock if necessary for this metadata version. We prevent the available
space from shrinking to less than the used size, and allow it to be set to zero
to fill all the available space on the underlying device.

Signed-off-by: Chris Webb <chris@arachsys.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 100 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 87 insertions(+), 13 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index dc99d95a1b6d..df1230af02cd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -658,11 +658,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
  */
 
 struct super_type  {
-	char 		*name;
-	struct module	*owner;
-	int		(*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
-	int		(*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-	void		(*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	char		    *name;
+	struct module	    *owner;
+	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+					  int minor_version);
+	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
+						unsigned long long size);
 };
 
 /*
@@ -1003,6 +1006,27 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->sb_csum = calc_sb_csum(sb);
 }
 
+/*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
+{
+	if (size && size < rdev->mddev->size)
+		return 0; /* component must fit device */
+	size *= 2; /* convert to sectors */
+	if (rdev->mddev->bitmap_offset)
+		return 0; /* can't move bitmap */
+	rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+	if (!size || size > rdev->sb_offset*2)
+		size = rdev->sb_offset*2;
+	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return size/2; /* kB for sysfs */
+}
+
+
 /*
  * version 1 superblock
  */
@@ -1328,21 +1352,59 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
+static unsigned long long
+super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
+{
+	struct mdp_superblock_1 *sb;
+	unsigned long long max_size;
+	if (size && size < rdev->mddev->size)
+		return 0; /* component must fit device */
+	size *= 2; /* convert to sectors */
+	if (rdev->sb_offset < rdev->data_offset/2) {
+		/* minor versions 1 and 2; superblock before data */
+		max_size = (rdev->bdev->bd_inode->i_size >> 9);
+		max_size -= rdev->data_offset;
+		if (!size || size > max_size)
+			size = max_size;
+	} else if (rdev->mddev->bitmap_offset) {
+		/* minor version 0 with bitmap we can't move */
+		return 0;
+	} else {
+		/* minor version 0; superblock after data */
+		sector_t sb_offset;
+		sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_offset &= ~(sector_t)(4*2 - 1);
+		max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2;
+		if (!size || size > max_size)
+			size = max_size;
+		rdev->sb_offset = sb_offset/2;
+	}
+	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+	sb->data_size = cpu_to_le64(size);
+	sb->super_offset = rdev->sb_offset*2;
+	sb->sb_csum = calc_sb_1_csum(sb);
+	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return size/2; /* kB for sysfs */
+}
 
 static struct super_type super_types[] = {
 	[0] = {
 		.name	= "0.90.0",
 		.owner	= THIS_MODULE,
-		.load_super	= super_90_load,
-		.validate_super	= super_90_validate,
-		.sync_super	= super_90_sync,
+		.load_super	    = super_90_load,
+		.validate_super	    = super_90_validate,
+		.sync_super	    = super_90_sync,
+		.rdev_size_change   = super_90_rdev_size_change,
 	},
 	[1] = {
 		.name	= "md-1",
 		.owner	= THIS_MODULE,
-		.load_super	= super_1_load,
-		.validate_super	= super_1_validate,
-		.sync_super	= super_1_sync,
+		.load_super	    = super_1_load,
+		.validate_super	    = super_1_validate,
+		.sync_super	    = super_1_sync,
+		.rdev_size_change   = super_1_rdev_size_change,
 	},
 };
 
@@ -2060,8 +2122,20 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (my_mddev->pers && rdev->raid_disk >= 0)
-		return -EBUSY;
+	if (my_mddev->pers && rdev->raid_disk >= 0) {
+		if (rdev->mddev->persistent) {
+			size = super_types[rdev->mddev->major_version].
+				rdev_size_change(rdev, size);
+			if (!size)
+				return -EBUSY;
+		} else if (!size) {
+			size = (rdev->bdev->bd_inode->i_size >> 10);
+			size -= rdev->data_offset/2;
+		}
+		if (size < rdev->mddev->size)
+			return -EINVAL; /* component must fit device */
+	}
+
 	rdev->size = size;
 	if (size > oldsize && rdev->mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev

From b203886edbcaac3ca427cf4dbcb50b18bdb346fd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:50 +1000
Subject: [PATCH 16/52] md: kill STRIPE_OP_MOD_DMA in raid5 offload

From: Dan Williams <dan.j.williams@intel.com>

This micro-optimization allowed the raid code to skip a re-read of the
parity block after checking parity.  It took advantage of the fact that
xor-offload-engines have their own internal result buffer and can check
parity without writing to memory.  Remove it for the following reasons:

1/ It is a layering violation for MD to need to manage the DMA and
   non-DMA paths within async_xor_zero_sum
2/ Bad precedent to toggle the 'ops' flags outside the lock
3/ Hard to realize a performance gain as reads will not need an updated
   parity block and writes will dirty it anyways.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 10 ----------
 include/linux/raid/raid5.h |  2 --
 2 files changed, 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8c4e6149daea..60e61d2464b5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -837,15 +837,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 static void ops_complete_check(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	int pd_idx = sh->pd_idx;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
-		sh->ops.zero_sum_result == 0)
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-
 	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -873,11 +868,6 @@ static void ops_run_check(struct stripe_head *sh)
 	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
 
-	if (tx)
-		set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-	else
-		clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-
 	atomic_inc(&sh->count);
 	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
 		ops_complete_check, sh);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f0827d31ae6f..4ecae31a3dcb 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -267,10 +267,8 @@ struct r6_state {
 
 /* modifiers to the base operations
  * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
- * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
  */
 #define STRIPE_OP_MOD_REPAIR_PD 7
-#define STRIPE_OP_MOD_DMA_CHECK 8
 
 /*
  * Plugging:

From 2b7497f0e0a0b9cf21d822e427d5399b2056501a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:52 +1000
Subject: [PATCH 17/52] md: kill STRIPE_OP_IO flag

From: Dan Williams <dan.j.williams@intel.com>

The R5_Want{Read,Write} flags already gate i/o.  So, this flag is
superfluous and we can unconditionally call ops_run_io().

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 32 +++++---------------------------
 include/linux/raid/raid5.h |  1 -
 2 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 60e61d2464b5..cac97080b278 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -373,8 +373,6 @@ static unsigned long get_stripe_work(struct stripe_head *sh)
 	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
 	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
 	test_and_ack_op(STRIPE_OP_CHECK, pending);
-	if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
-		ack++;
 
 	sh->ops.count -= ack;
 	if (unlikely(sh->ops.count < 0)) {
@@ -399,7 +397,6 @@ static void ops_run_io(struct stripe_head *sh)
 
 	might_sleep();
 
-	set_bit(STRIPE_IO_STARTED, &sh->state);
 	for (i = disks; i--; ) {
 		int rw;
 		struct bio *bi;
@@ -433,6 +430,8 @@ static void ops_run_io(struct stripe_head *sh)
 				test_bit(STRIPE_EXPAND_READY, &sh->state))
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
+			set_bit(STRIPE_IO_STARTED, &sh->state);
+
 			bi->bi_bdev = rdev->bdev;
 			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
 				__func__, (unsigned long long)sh->sector,
@@ -900,9 +899,6 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
 	if (test_bit(STRIPE_OP_CHECK, &pending))
 		ops_run_check(sh);
 
-	if (test_bit(STRIPE_OP_IO, &pending))
-		ops_run_io(sh);
-
 	if (overlap_clear)
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2013,8 +2009,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 			 */
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantread, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			s->locked++;
 			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
 				s->syncing);
@@ -2208,9 +2202,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 						"%d for r-m-w\n", i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
-					if (!test_and_set_bit(
-						STRIPE_OP_IO, &sh->ops.pending))
-						sh->ops.count++;
 					s->locked++;
 				} else {
 					set_bit(STRIPE_DELAYED, &sh->state);
@@ -2234,9 +2225,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 						"%d for Reconstruct\n", i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
-					if (!test_and_set_bit(
-						STRIPE_OP_IO, &sh->ops.pending))
-						sh->ops.count++;
 					s->locked++;
 				} else {
 					set_bit(STRIPE_DELAYED, &sh->state);
@@ -2444,8 +2432,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 		set_bit(R5_LOCKED, &dev->flags);
 		set_bit(R5_Wantwrite, &dev->flags);
-		if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-			sh->ops.count++;
 
 		clear_bit(STRIPE_DEGRADED, &sh->state);
 		s->locked++;
@@ -2801,9 +2787,6 @@ static void handle_stripe5(struct stripe_head *sh)
 				(i == sh->pd_idx || dev->written)) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
-				if (!test_and_set_bit(
-				    STRIPE_OP_IO, &sh->ops.pending))
-					sh->ops.count++;
 				if (prexor)
 					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2857,16 +2840,12 @@ static void handle_stripe5(struct stripe_head *sh)
 		dev = &sh->dev[s.failed_num];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
 		}
@@ -2884,13 +2863,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
 		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
 
-		for (i = conf->raid_disks; i--; ) {
+		for (i = conf->raid_disks; i--; )
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
-		}
 	}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
@@ -2926,6 +2902,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (pending)
 		raid5_run_ops(sh, pending);
 
+	ops_run_io(sh);
+
 	return_io(return_bi);
 
 }
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 4ecae31a3dcb..1301195abf4b 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -263,7 +263,6 @@ struct r6_state {
 #define STRIPE_OP_BIODRAIN	3
 #define STRIPE_OP_POSTXOR	4
 #define STRIPE_OP_CHECK	5
-#define STRIPE_OP_IO		6
 
 /* modifiers to the base operations
  * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back

From c4e5ac0a22e664eecf29249553cf16c2433f5f25 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:53 +1000
Subject: [PATCH 18/52] md: use stripe_head_state in ops_run_io()

From: Dan Williams <dan.j.williams@intel.com>

In handle_stripe after taking sh->lock we sample some bits into 's' (struct
stripe_head_state):

	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);

Use these values from 's' in ops_run_io() rather than re-sampling the bits.
This ensures a consistent snapshot (as seen under sh->lock) is used.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cac97080b278..c4ef3071c290 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -390,7 +390,7 @@ raid5_end_read_request(struct bio *bi, int error);
 static void
 raid5_end_write_request(struct bio *bi, int error);
 
-static void ops_run_io(struct stripe_head *sh)
+static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, disks = sh->disks;
@@ -425,9 +425,7 @@ static void ops_run_io(struct stripe_head *sh)
 		rcu_read_unlock();
 
 		if (rdev) {
-			if (test_bit(STRIPE_SYNCING, &sh->state) ||
-				test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
-				test_bit(STRIPE_EXPAND_READY, &sh->state))
+			if (s->syncing || s->expanding || s->expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -2902,10 +2900,9 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (pending)
 		raid5_run_ops(sh, pending);
 
-	ops_run_io(sh);
+	ops_run_io(sh, &s);
 
 	return_io(return_bi);
-
 }
 
 static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)

From f0e43bcdebf709d747a3effb210aff1941e819ab Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:55 +1000
Subject: [PATCH 19/52] md: unify raid5/6 i/o submission

From: Dan Williams <dan.j.williams@intel.com>

Let the raid6 path call ops_run_io to get pending i/o submitted.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 63 ++--------------------------------------------
 1 file changed, 2 insertions(+), 61 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c4ef3071c290..6f3dd12dd3a4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3114,68 +3114,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
+	ops_run_io(sh, &s);
+
 	return_io(return_bi);
-
-	for (i=disks; i-- ;) {
-		int rw;
-		struct bio *bi;
-		mdk_rdev_t *rdev;
-		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = WRITE;
-		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-			rw = READ;
-		else
-			continue;
-
-		set_bit(STRIPE_IO_STARTED, &sh->state);
-
-		bi = &sh->dev[i].req;
-
-		bi->bi_rw = rw;
-		if (rw == WRITE)
-			bi->bi_end_io = raid5_end_write_request;
-		else
-			bi->bi_end_io = raid5_end_read_request;
-
-		rcu_read_lock();
-		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && test_bit(Faulty, &rdev->flags))
-			rdev = NULL;
-		if (rdev)
-			atomic_inc(&rdev->nr_pending);
-		rcu_read_unlock();
-
-		if (rdev) {
-			if (s.syncing || s.expanding || s.expanded)
-				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-			bi->bi_bdev = rdev->bdev;
-			pr_debug("for %llu schedule op %ld on disc %d\n",
-				(unsigned long long)sh->sector, bi->bi_rw, i);
-			atomic_inc(&sh->count);
-			bi->bi_sector = sh->sector + rdev->data_offset;
-			bi->bi_flags = 1 << BIO_UPTODATE;
-			bi->bi_vcnt = 1;
-			bi->bi_max_vecs = 1;
-			bi->bi_idx = 0;
-			bi->bi_io_vec = &sh->dev[i].vec;
-			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-			bi->bi_io_vec[0].bv_offset = 0;
-			bi->bi_size = STRIPE_SIZE;
-			bi->bi_next = NULL;
-			if (rw == WRITE &&
-			    test_bit(R5_ReWrite, &sh->dev[i].flags))
-				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-			generic_make_request(bi);
-		} else {
-			if (rw == WRITE)
-				set_bit(STRIPE_DEGRADED, &sh->state);
-			pr_debug("skip op %ld on disc %d for sector %llu\n",
-				bi->bi_rw, i, (unsigned long long)sh->sector);
-			clear_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(STRIPE_HANDLE, &sh->state);
-		}
-	}
 }
 
 static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)

From ecc65c9b3f9b9d740a5deade3d85b39be56401b6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:57 +1000
Subject: [PATCH 20/52] md: replace STRIPE_OP_CHECK with 'check_states'

From: Dan Williams <dan.j.williams@intel.com>

The STRIPE_OP_* flags record the state of stripe operations which are
performed outside the stripe lock.  Their use in indicating which
operations need to be run is straightforward; however, interpolating what
the next state of the stripe should be based on a given combination of
these flags is not straightforward, and has led to bugs.  An easier to read
implementation with minimal degrees of freedom is needed.

Towards this goal, this patch introduces explicit states to replace what was
previously interpolated from the STRIPE_OP_* flags.  For now this only converts
the handle_parity_checks5 path, removing a user of the
ops.{pending,ack,complete,count} fields of struct stripe_operations.

This conversion also found a remaining issue with the current code.  There is
a small window for a drive to fail between when we schedule a repair and when
the parity calculation for that repair completes.  When this happens we will
writeback to 'failed_num' when we really want to write back to 'pd_idx'.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 174 ++++++++++++++++++-------------------
 include/linux/raid/raid5.h |  46 ++++++++--
 2 files changed, 124 insertions(+), 96 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6f3dd12dd3a4..544e1600f208 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	set_bit(R5_UPTODATE, &tgt->flags);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 	clear_bit(R5_Wantcompute, &tgt->flags);
-	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
+	if (sh->check_state == check_state_compute_run)
+		sh->check_state = check_state_compute_result;
+	else
+		set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+	sh->check_state = check_state_check_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh)
 		ops_complete_check, sh);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
+static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
+			  unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) ||
+	    test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
 		tx = ops_run_compute5(sh, pending);
 
 	if (test_bit(STRIPE_OP_PREXOR, &pending))
@@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
 	if (test_bit(STRIPE_OP_POSTXOR, &pending))
 		ops_run_postxor(sh, tx, pending);
 
-	if (test_bit(STRIPE_OP_CHECK, &pending))
+	if (test_bit(STRIPE_OP_CHECK, &ops_request))
 		ops_run_check(sh);
 
 	if (overlap_clear)
@@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 	/* don't schedule compute operations or reads on the parity block while
 	 * a check is in flight
 	 */
-	if ((disk_idx == sh->pd_idx) &&
-	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+	if (disk_idx == sh->pd_idx && sh->check_state)
 		return ~0;
 
 	/* is the data in this block needed, and can we get it? */
@@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 		 * 3/ We hold off parity block re-reads until check operations
 		 * have quiesced.
 		 */
-		if ((s->uptodate == disks - 1) &&
-		    (s->failed && disk_idx == s->failed_num) &&
-		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+		if ((s->uptodate == disks - 1) && !sh->check_state &&
+		    (s->failed && disk_idx == s->failed_num)) {
 			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
 			set_bit(R5_Wantcompute, &dev->flags);
 			sh->ops.target = disk_idx;
@@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 {
 	int i;
 
-	/* Clear completed compute operations.  Parity recovery
-	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
-	 * later on in this routine
-	 */
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+	/* Clear completed compute operations */
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) {
 		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
 		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
 		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
@@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks)
 {
-	int canceled_check = 0;
+	struct r5dev *dev = NULL;
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 
-	/* complete a check operation */
-	if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
-		clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
-		clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+	switch (sh->check_state) {
+	case check_state_idle:
+		/* start a new check operation if there are no failures */
 		if (s->failed == 0) {
-			if (sh->ops.zero_sum_result == 0)
-				/* parity is correct (on disc,
-				 * not in buffer any more)
-				 */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			else {
-				conf->mddev->resync_mismatches +=
-					STRIPE_SECTORS;
-				if (test_bit(
-				     MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					set_bit(STRIPE_INSYNC, &sh->state);
-				else {
-					set_bit(STRIPE_OP_COMPUTE_BLK,
-						&sh->ops.pending);
-					set_bit(STRIPE_OP_MOD_REPAIR_PD,
-						&sh->ops.pending);
-					set_bit(R5_Wantcompute,
-						&sh->dev[sh->pd_idx].flags);
-					sh->ops.target = sh->pd_idx;
-					sh->ops.count++;
-					s->uptodate++;
-				}
-			}
-		} else
-			canceled_check = 1; /* STRIPE_INSYNC is not set */
-	}
-
-	/* start a new check operation if there are no failures, the stripe is
-	 * not insync, and a repair is not in flight
-	 */
-	if (s->failed == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state) &&
-	    !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-		if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
 			BUG_ON(s->uptodate != disks);
+			sh->check_state = check_state_run;
+			set_bit(STRIPE_OP_CHECK, &s->ops_request);
 			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
-			sh->ops.count++;
 			s->uptodate--;
+			break;
 		}
-	}
-
-	/* check if we can clear a parity disk reconstruct */
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-	    test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-
-		clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
-	}
-
-
-	/* Wait for check parity and compute block operations to complete
-	 * before write-back.  If a failure occurred while the check operation
-	 * was in flight we need to cycle this stripe through handle_stripe
-	 * since the parity block may not be uptodate
-	 */
-	if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
-	    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
-		struct r5dev *dev;
-		/* either failed parity check, or recovery is happening */
-		if (s->failed == 0)
-			s->failed_num = sh->pd_idx;
 		dev = &sh->dev[s->failed_num];
+		/* fall through */
+	case check_state_compute_result:
+		sh->check_state = check_state_idle;
+		if (!dev)
+			dev = &sh->dev[sh->pd_idx];
+
+		/* check that a write has not made the stripe insync */
+		if (test_bit(STRIPE_INSYNC, &sh->state))
+			break;
+
+		/* either failed parity check, or recovery is happening */
 		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
 		BUG_ON(s->uptodate != disks);
 
 		set_bit(R5_LOCKED, &dev->flags);
+		s->locked++;
 		set_bit(R5_Wantwrite, &dev->flags);
 
 		clear_bit(STRIPE_DEGRADED, &sh->state);
-		s->locked++;
 		set_bit(STRIPE_INSYNC, &sh->state);
+		break;
+	case check_state_run:
+		break; /* we will be called again upon completion */
+	case check_state_check_result:
+		sh->check_state = check_state_idle;
+
+		/* if a failure occurred during the check operation, leave
+		 * STRIPE_INSYNC not set and let the stripe be handled again
+		 */
+		if (s->failed)
+			break;
+
+		/* handle a successful check operation, if parity is correct
+		 * we are done.  Otherwise update the mismatch count and repair
+		 * parity if !MD_RECOVERY_CHECK
+		 */
+		if (sh->ops.zero_sum_result == 0)
+			/* parity is correct (on disc,
+			 * not in buffer any more)
+			 */
+			set_bit(STRIPE_INSYNC, &sh->state);
+		else {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				sh->check_state = check_state_compute_run;
+				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+				set_bit(R5_Wantcompute,
+					&sh->dev[sh->pd_idx].flags);
+				sh->ops.target = sh->pd_idx;
+				s->uptodate++;
+			}
+		}
+		break;
+	case check_state_compute_run:
+		break;
+	default:
+		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		       __func__, sh->check_state,
+		       (unsigned long long) sh->sector);
+		BUG();
 	}
 }
 
@@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 *    block.
 	 */
 	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-			  !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+	    !sh->check_state)
 		handle_issuing_new_write_requests5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
@@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * data is available.  The parity check is held off while parity
 	 * dependent operations are in flight.
 	 */
-	if ((s.syncing && s.locked == 0 &&
+	if (sh->check_state ||
+	    (s.syncing && s.locked == 0 &&
 	     !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
-	     !test_bit(STRIPE_INSYNC, &sh->state)) ||
-	      test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
-	      test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
+	     !test_bit(STRIPE_INSYNC, &sh->state)))
 		handle_parity_checks5(conf, sh, &s, disks);
 
 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
-	if (pending)
-		raid5_run_ops(sh, pending);
+	if (pending || s.ops_request)
+		raid5_run_ops(sh, pending, s.ops_request);
 
 	ops_run_io(sh, &s);
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 1301195abf4b..2c96d5fd54bf 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -158,6 +158,41 @@
  *    the compute block completes.
  */
 
+/*
+ * Operations state - intermediate states that are visible outside of sh->lock
+ * In general _idle indicates nothing is running, _run indicates a data
+ * processing operation is active, and _result means the data processing result
+ * is stable and can be acted upon.  For simple operations like biofill and
+ * compute that only have an _idle and _run state they are indicated with
+ * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - check operation is running
+ * @check_state_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+	check_state_idle = 0,
+	check_state_run, /* parity check */
+	check_state_check_result,
+	check_state_compute_run, /* parity repair */
+	check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+	reconstruct_state_idle = 0,
+	reconstruct_state_drain_run,		/* write */
+	reconstruct_state_run,			/* expand */
+	reconstruct_state_drain_result,
+	reconstruct_state_result,
+};
+
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;			/* inactive_list or handle_list */
@@ -169,6 +204,7 @@ struct stripe_head {
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
+	enum check_states	check_state;
 	/* stripe_operations
 	 * @pending - pending ops flags (set for request->issue->complete)
 	 * @ack - submitted ops flags (set for issue->complete)
@@ -202,6 +238,7 @@ struct stripe_head_state {
 	int locked, uptodate, to_read, to_write, failed, written;
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num;
+	unsigned long ops_request;
 };
 
 /* r6_state - extra state data only relevant to r6 */
@@ -254,8 +291,10 @@ struct r6_state {
 #define	STRIPE_EXPAND_READY	11
 #define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
 #define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
+#define	STRIPE_BIOFILL_RUN	14
+#define	STRIPE_COMPUTE_RUN	15
 /*
- * Operations flags (in issue order)
+ * Operation request flags
  */
 #define STRIPE_OP_BIOFILL	0
 #define STRIPE_OP_COMPUTE_BLK	1
@@ -264,11 +303,6 @@ struct r6_state {
 #define STRIPE_OP_POSTXOR	4
 #define STRIPE_OP_CHECK	5
 
-/* modifiers to the base operations
- * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
- */
-#define STRIPE_OP_MOD_REPAIR_PD 7
-
 /*
  * Plugging:
  *

From 83de75cc92be599850e5ef3928e07cd840833499 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:58 +1000
Subject: [PATCH 21/52] md: replace STRIPE_OP_BIOFILL with STRIPE_BIOFILL_RUN

From: Dan Williams <dan.j.williams@intel.com>

Track the state of read operations (copying data from the stripe cache to bio
buffers outside the lock) with a state flag.  Reduce the scope of the
STRIPE_OP_BIOFILL flag to only tracking whether a biofill operation has been
requested via the ops_request field of struct stripe_head_state.

This is another step towards the removal of ops.{pending,ack,complete,count},
i.e. STRIPE_OP_BIOFILL only requests an operation and does not track the state
of the operation.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 544e1600f208..b9c0a32a4f95 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -523,38 +523,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
 		(unsigned long long)sh->sector);
 
 	/* clear completed biofills */
+	spin_lock_irq(&conf->device_lock);
 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
 		/* acknowledge completion of a biofill operation */
 		/* and check if we need to reply to a read request,
 		 * new R5_Wantfill requests are held off until
-		 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)
+		 * !STRIPE_BIOFILL_RUN
 		 */
 		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
 			struct bio *rbi, *rbi2;
 
-			/* The access to dev->read is outside of the
-			 * spin_lock_irq(&conf->device_lock), but is protected
-			 * by the STRIPE_OP_BIOFILL pending bit
-			 */
 			BUG_ON(!dev->read);
 			rbi = dev->read;
 			dev->read = NULL;
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				spin_lock_irq(&conf->device_lock);
 				if (--rbi->bi_phys_segments == 0) {
 					rbi->bi_next = return_bi;
 					return_bi = rbi;
 				}
-				spin_unlock_irq(&conf->device_lock);
 				rbi = rbi2;
 			}
 		}
 	}
-	set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+	spin_unlock_irq(&conf->device_lock);
+	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
 	return_io(return_bi);
 
@@ -880,7 +876,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
 
-	if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
+	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
 		ops_run_biofill(sh);
 		overlap_clear++;
 	}
@@ -2630,15 +2626,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+
 	/* Now to look around and see what can be done */
-
-	/* clean-up completed biofill operations */
-	if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
-		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
-		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
-		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
-	}
-
 	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
@@ -2652,10 +2641,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		/* maybe we can request a biofill operation
 		 *
 		 * new wantfill requests are only permitted while
-		 * STRIPE_OP_BIOFILL is clear
+		 * ops_complete_biofill is guaranteed to be inactive
 		 */
 		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
-			!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
 			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
@@ -2699,8 +2688,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		goto unlock;
 	}
 
-	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
-		sh->ops.count++;
+	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+	}
 
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",

From 976ea8d475675da6e86bd434328814ccbf5ae641 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:32:03 +1000
Subject: [PATCH 22/52] md: replace STRIPE_OP_COMPUTE_BLK with
 STRIPE_COMPUTE_RUN

From: Dan Williams <dan.j.williams@intel.com>

Track the state of compute operations (recalculating a block from all the other
blocks in a stripe) with a state flag.  Reduces the scope of the
STRIPE_OP_COMPUTE_BLK flag to only tracking whether a compute operation has
been requested via the ops_request field of struct stripe_head_state.

Note, the compute operation that is performed in the course of doing a 'repair'
operation (check the parity block, recalculate it and write it back if the
check result is not zero) is tracked separately with the 'check_state'
variable.  Compute operations are held off while a 'check' is in progress, and
moving this check out to handle_issuing_new_read_requests5 the helper routine
__handle_issuing_new_read_requests5 can be simplified.

This is another step towards the removal of ops.{pending,ack,complete,count},
i.e. STRIPE_OP_COMPUTE_BLK only requests an operation and does not track the
state of the operation.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 65 +++++++++++++---------------------------------
 1 file changed, 18 insertions(+), 47 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9c0a32a4f95..835046bf384e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -604,8 +604,6 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
 	if (sh->check_state == check_state_compute_run)
 		sh->check_state = check_state_compute_result;
-	else
-		set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -881,8 +879,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) ||
-	    test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
 		tx = ops_run_compute5(sh, pending);
 
 	if (test_bit(STRIPE_OP_PREXOR, &pending))
@@ -1960,12 +1957,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 	struct r5dev *dev = &sh->dev[disk_idx];
 	struct r5dev *failed_dev = &sh->dev[s->failed_num];
 
-	/* don't schedule compute operations or reads on the parity block while
-	 * a check is in flight
-	 */
-	if (disk_idx == sh->pd_idx && sh->check_state)
-		return ~0;
-
 	/* is the data in this block needed, and can we get it? */
 	if (!test_bit(R5_LOCKED, &dev->flags) &&
 	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
@@ -1974,23 +1965,16 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 	     (failed_dev->toread || (failed_dev->towrite &&
 	     !test_bit(R5_OVERWRITE, &failed_dev->flags)
 	     ))))) {
-		/* 1/ We would like to get this block, possibly by computing it,
-		 * but we might not be able to.
-		 *
-		 * 2/ Since parity check operations potentially make the parity
-		 * block !uptodate it will need to be refreshed before any
-		 * compute operations on data disks are scheduled.
-		 *
-		 * 3/ We hold off parity block re-reads until check operations
-		 * have quiesced.
+		/* We would like to get this block, possibly by computing it,
+		 * otherwise read it if the backing disk is insync
 		 */
-		if ((s->uptodate == disks - 1) && !sh->check_state &&
+		if ((s->uptodate == disks - 1) &&
 		    (s->failed && disk_idx == s->failed_num)) {
-			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
 			set_bit(R5_Wantcompute, &dev->flags);
 			sh->ops.target = disk_idx;
 			s->req_compute = 1;
-			sh->ops.count++;
 			/* Careful: from this point on 'uptodate' is in the eye
 			 * of raid5_run_ops which services 'compute' operations
 			 * before writes. R5_Wantcompute flags a block that will
@@ -1999,12 +1983,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 			 */
 			s->uptodate++;
 			return 0; /* uptodate + compute == disks */
-		} else if ((s->uptodate < disks - 1) &&
-			test_bit(R5_Insync, &dev->flags)) {
-			/* Note: we hold off compute operations while checks are
-			 * in flight, but we still prefer 'compute' over 'read'
-			 * hence we only read if (uptodate < * disks-1)
-			 */
+		} else if (test_bit(R5_Insync, &dev->flags)) {
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantread, &dev->flags);
 			s->locked++;
@@ -2021,20 +2000,13 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 {
 	int i;
 
-	/* Clear completed compute operations */
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) {
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
-	}
-
 	/* look for blocks to read/compute, skip this if a compute
 	 * is already in flight, or if the stripe contents are in the
 	 * midst of changing due to a write
 	 */
-	if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
-		!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
-		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
+	    !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+	    !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
 		for (i = disks; i--; )
 			if (__handle_issuing_new_read_requests5(
 				sh, s, i, disks) == 0)
@@ -2236,10 +2208,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 	 * simultaneously.  If this is not the case then new writes need to be
 	 * held off until the compute completes.
 	 */
-	if ((s->req_compute ||
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
-		(s->locked == 0 && (rcw == 0 || rmw == 0) &&
-		!test_bit(STRIPE_BIT_DELAY, &sh->state)))
+	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
+	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
+	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		s->locked += handle_write_operations5(sh, rcw == 0, 0);
 }
 
@@ -2410,6 +2381,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				set_bit(STRIPE_INSYNC, &sh->state);
 			else {
 				sh->check_state = check_state_compute_run;
+				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
 				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
 				set_bit(R5_Wantcompute,
 					&sh->dev[sh->pd_idx].flags);
@@ -2725,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * or to load a block that is being partially written.
 	 */
 	if (s.to_read || s.non_overwrite ||
-	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
-	    test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
 		handle_issuing_new_read_requests5(sh, &s, disks);
 
 	/* Now we check to see if any write operations have recently
@@ -2803,7 +2774,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 */
 	if (sh->check_state ||
 	    (s.syncing && s.locked == 0 &&
-	     !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 	     !test_bit(STRIPE_INSYNC, &sh->state)))
 		handle_parity_checks5(conf, sh, &s, disks);
 
@@ -2869,7 +2840,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	}
 
 	if (s.expanding && s.locked == 0 &&
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, NULL);
 
 	if (sh->ops.count)
@@ -3089,7 +3060,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	}
 
 	if (s.expanding && s.locked == 0 &&
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
+	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, &r6s);
 
  unlock:

From 600aa10993012ff2dd5617720dac081e4f992017 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:32:05 +1000
Subject: [PATCH 23/52] md: replace STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} with
 'reconstruct_states'

From: Dan Williams <dan.j.williams@intel.com>

Track the state of reconstruct operations (recalculating the parity block
usually due to incoming writes, or as part of array expansion)  Reduces the
scope of the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags to only tracking whether
a reconstruct operation has been requested via the ops_request field of struct
stripe_head_state.

This is the final step in the removal of ops.{pending,ack,complete,count}, i.e.
the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags only request an operation and do
not track the state of the operation.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 204 +++++++++++--------------------------
 include/linux/raid/raid5.h |   9 +-
 2 files changed, 63 insertions(+), 150 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 835046bf384e..b9159367491a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -122,6 +122,13 @@ static void return_io(struct bio *return_bi)
 
 static void print_raid5_conf (raid5_conf_t *conf);
 
+static int stripe_operations_active(struct stripe_head *sh)
+{
+	return sh->check_state || sh->reconstruct_state ||
+	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
+	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
+}
+
 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 {
 	if (atomic_dec_and_test(&sh->count)) {
@@ -141,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 			}
 			md_wakeup_thread(conf->mddev->thread);
 		} else {
-			BUG_ON(sh->ops.pending);
+			BUG_ON(stripe_operations_active(sh));
 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 				atomic_dec(&conf->preread_active_stripes);
 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -243,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
 
 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
-	BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
+	BUG_ON(stripe_operations_active(sh));
 
 	CHECK_DEVLOCK();
 	pr_debug("init_stripe called, stripe %llu\n",
@@ -344,47 +351,6 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 	return sh;
 }
 
-/* test_and_ack_op() ensures that we only dequeue an operation once */
-#define test_and_ack_op(op, pend) \
-do {							\
-	if (test_bit(op, &sh->ops.pending) &&		\
-		!test_bit(op, &sh->ops.complete)) {	\
-		if (test_and_set_bit(op, &sh->ops.ack)) \
-			clear_bit(op, &pend);		\
-		else					\
-			ack++;				\
-	} else						\
-		clear_bit(op, &pend);			\
-} while (0)
-
-/* find new work to run, do not resubmit work that is already
- * in flight
- */
-static unsigned long get_stripe_work(struct stripe_head *sh)
-{
-	unsigned long pending;
-	int ack = 0;
-
-	pending = sh->ops.pending;
-
-	test_and_ack_op(STRIPE_OP_BIOFILL, pending);
-	test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
-	test_and_ack_op(STRIPE_OP_PREXOR, pending);
-	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
-	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
-	test_and_ack_op(STRIPE_OP_CHECK, pending);
-
-	sh->ops.count -= ack;
-	if (unlikely(sh->ops.count < 0)) {
-		printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
-			"ops.complete: %#lx\n", pending, sh->ops.pending,
-			sh->ops.ack, sh->ops.complete);
-		BUG();
-	}
-
-	return pending;
-}
-
 static void
 raid5_end_read_request(struct bio *bi, int error);
 static void
@@ -609,7 +575,7 @@ static void ops_complete_compute5(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_compute5(struct stripe_head *sh, unsigned long pending)
+ops_run_compute5(struct stripe_head *sh, unsigned long ops_request)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -640,7 +606,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
 			ops_complete_compute5, sh);
 
 	/* ack now if postxor is not set to be run */
-	if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
+	if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
 		async_tx_ack(tx);
 
 	return tx;
@@ -652,8 +618,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
-
-	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
 }
 
 static struct dma_async_tx_descriptor *
@@ -686,7 +650,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		 unsigned long pending)
+		 unsigned long ops_request)
 {
 	int disks = sh->disks;
 	int pd_idx = sh->pd_idx, i;
@@ -694,7 +658,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (Wantprexor)
 	 */
-	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
+	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -744,7 +708,7 @@ static void ops_complete_postxor(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+	sh->reconstruct_state = reconstruct_state_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -763,16 +727,14 @@ static void ops_complete_write(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 	}
 
-	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
+	sh->reconstruct_state = reconstruct_state_drain_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
 static void
 ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		unsigned long pending)
+		unsigned long ops_request)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -780,7 +742,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
-	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
+	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
 	unsigned long flags;
 	dma_async_tx_callback callback;
 
@@ -807,7 +769,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	}
 
 	/* check whether this postxor is part of a write */
-	callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
+	callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ?
 		ops_complete_write : ops_complete_postxor;
 
 	/* 1/ if we prexor'd then the dest is reused as a source
@@ -868,8 +830,7 @@ static void ops_run_check(struct stripe_head *sh)
 		ops_complete_check, sh);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
-			  unsigned long ops_request)
+static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -880,18 +841,18 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
 	}
 
 	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
-		tx = ops_run_compute5(sh, pending);
+		tx = ops_run_compute5(sh, ops_request);
 
-	if (test_bit(STRIPE_OP_PREXOR, &pending))
+	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
 		tx = ops_run_prexor(sh, tx);
 
-	if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
-		tx = ops_run_biodrain(sh, tx, pending);
+	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
+		tx = ops_run_biodrain(sh, tx, ops_request);
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_POSTXOR, &pending))
-		ops_run_postxor(sh, tx, pending);
+	if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
+		ops_run_postxor(sh, tx, ops_request);
 
 	if (test_bit(STRIPE_OP_CHECK, &ops_request))
 		ops_run_check(sh);
@@ -1684,11 +1645,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 	}
 }
 
-static int
-handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
+static void
+handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
+			 int rcw, int expand)
 {
 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
-	int locked = 0;
 
 	if (rcw) {
 		/* if we are not expanding this is a proper write request, and
@@ -1696,12 +1657,12 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 		 * stripe cache
 		 */
 		if (!expand) {
-			set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-			sh->ops.count++;
-		}
+			sh->reconstruct_state = reconstruct_state_drain_run;
+			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		} else
+			sh->reconstruct_state = reconstruct_state_run;
 
-		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		sh->ops.count++;
+		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1710,21 +1671,20 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 				set_bit(R5_LOCKED, &dev->flags);
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
-				locked++;
+				s->locked++;
 			}
 		}
-		if (locked + 1 == disks)
+		if (s->locked + 1 == disks)
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 				atomic_inc(&sh->raid_conf->pending_full_writes);
 	} else {
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
-		set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-		set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-
-		sh->ops.count += 3;
+		sh->reconstruct_state = reconstruct_state_drain_run;
+		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1742,7 +1702,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 				set_bit(R5_Wantprexor, &dev->flags);
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
-				locked++;
+				s->locked++;
 			}
 		}
 	}
@@ -1752,13 +1712,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 	 */
 	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
 	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-	locked++;
+	s->locked++;
 
-	pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
+	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
-		locked, sh->ops.pending);
-
-	return locked;
+		s->locked, s->ops_request);
 }
 
 /*
@@ -2005,8 +1963,7 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 	 * midst of changing due to a write
 	 */
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
-	    !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+	    !sh->reconstruct_state) {
 		for (i = disks; i--; )
 			if (__handle_issuing_new_read_requests5(
 				sh, s, i, disks) == 0)
@@ -2211,7 +2168,7 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
-		s->locked += handle_write_operations5(sh, rcw == 0, 0);
+		handle_write_operations5(sh, s, rcw == 0, 0);
 }
 
 static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
@@ -2581,15 +2538,14 @@ static void handle_stripe5(struct stripe_head *sh)
 	struct bio *return_bi = NULL;
 	struct stripe_head_state s;
 	struct r5dev *dev;
-	unsigned long pending = 0;
 	mdk_rdev_t *blocked_rdev = NULL;
 	int prexor;
 
 	memset(&s, 0, sizeof(s));
-	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
-		"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
-		atomic_read(&sh->count), sh->pd_idx,
-		sh->ops.pending, sh->ops.ack, sh->ops.complete);
+	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
+		 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
+		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
+		 sh->reconstruct_state);
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2703,34 +2659,12 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
-
-	/* leave prexor set until postxor is done, allows us to distinguish
-	 * a rmw from a rcw during biodrain
-	 */
 	prexor = 0;
-	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
-		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		prexor = 1;
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-
+	if (sh->reconstruct_state == reconstruct_state_drain_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
 		for (i = disks; i--; )
-			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
-	}
-
-	/* if only POSTXOR is set then this is an 'expand' postxor */
-	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
-		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+			prexor += test_and_clear_bit(R5_Wantprexor,
+						     &sh->dev[i].flags);
 
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
@@ -2763,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
 	 */
-	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-	    !sh->check_state)
+	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
 		handle_issuing_new_write_requests5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
@@ -2805,18 +2738,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		}
 	}
 
-	/* Finish postxor operations initiated by the expansion
-	 * process
-	 */
-	if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
-		!test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
-
+	/* Finish reconstruct operations initiated by the expansion process */
+	if (sh->reconstruct_state == reconstruct_state_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
 		for (i = conf->raid_disks; i--; )
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 			set_bit(R5_LOCKED, &dev->flags);
@@ -2824,15 +2749,13 @@ static void handle_stripe5(struct stripe_head *sh)
 	}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
-		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
 			conf->raid_disks);
-		s.locked += handle_write_operations5(sh, 1, 1);
-	} else if (s.expanded &&
-		   s.locked == 0 &&
-		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+		handle_write_operations5(sh, &s, 1, 1);
+	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
@@ -2843,9 +2766,6 @@ static void handle_stripe5(struct stripe_head *sh)
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, NULL);
 
-	if (sh->ops.count)
-		pending = get_stripe_work(sh);
-
  unlock:
 	spin_unlock(&sh->lock);
 
@@ -2853,8 +2773,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
-	if (pending || s.ops_request)
-		raid5_run_ops(sh, pending, s.ops_request);
+	if (s.ops_request)
+		raid5_run_ops(sh, s.ops_request);
 
 	ops_run_io(sh, &s);
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 2c96d5fd54bf..5f3e674b87dd 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -205,19 +205,12 @@ struct stripe_head {
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
 	enum check_states	check_state;
+	enum reconstruct_states reconstruct_state;
 	/* stripe_operations
-	 * @pending - pending ops flags (set for request->issue->complete)
-	 * @ack - submitted ops flags (set for issue->complete)
-	 * @complete - completed ops flags (set for complete)
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-	 * @count - raid5_runs_ops is set to run when this is non-zero
 	 */
 	struct stripe_operations {
-		unsigned long	   pending;
-		unsigned long	   ack;
-		unsigned long	   complete;
 		int		   target;
-		int		   count;
 		u32		   zero_sum_result;
 	} ops;
 	struct r5dev {

From d8ee0728b5b30d7a6f62c399a95e953616d31f23 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:32:06 +1000
Subject: [PATCH 24/52] md: replace R5_WantPrexor with R5_WantDrain, add
 'prexor' reconstruct_states

From: Dan Williams <dan.j.williams@intel.com>

Currently ops_run_biodrain and other locations have extra logic to determine
which blocks are processed in the prexor and non-prexor cases.  This can be
eliminated if handle_write_operations5 flags the blocks to be processed in all
cases via R5_Wantdrain.  The presence of the prexor operation is tracked in
sh->reconstruct_state.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 89 +++++++++++++-------------------------
 include/linux/raid/raid5.h |  6 +--
 2 files changed, 32 insertions(+), 63 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9159367491a..c71246061c0e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -637,7 +637,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
+		if (test_bit(R5_Wantdrain, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -649,16 +649,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		 unsigned long ops_request)
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	int pd_idx = sh->pd_idx, i;
-
-	/* check if prexor is active which means only process blocks
-	 * that are part of a read-modify-write (Wantprexor)
-	 */
-	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -666,20 +660,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		struct bio *chosen;
-		int towrite;
 
-		towrite = 0;
-		if (prexor) { /* rmw */
-			if (dev->towrite &&
-			    test_bit(R5_Wantprexor, &dev->flags))
-				towrite = 1;
-		} else { /* rcw */
-			if (i != pd_idx && dev->towrite &&
-				test_bit(R5_LOCKED, &dev->flags))
-				towrite = 1;
-		}
-
-		if (towrite) {
+		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
 			spin_lock(&sh->lock);
@@ -702,18 +684,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 }
 
 static void ops_complete_postxor(void *stripe_head_ref)
-{
-	struct stripe_head *sh = stripe_head_ref;
-
-	pr_debug("%s: stripe %llu\n", __func__,
-		(unsigned long long)sh->sector);
-
-	sh->reconstruct_state = reconstruct_state_result;
-	set_bit(STRIPE_HANDLE, &sh->state);
-	release_stripe(sh);
-}
-
-static void ops_complete_write(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 	int disks = sh->disks, i, pd_idx = sh->pd_idx;
@@ -727,14 +697,21 @@ static void ops_complete_write(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 	}
 
-	sh->reconstruct_state = reconstruct_state_drain_result;
+	if (sh->reconstruct_state == reconstruct_state_drain_run)
+		sh->reconstruct_state = reconstruct_state_drain_result;
+	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
+		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
+	else {
+		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
+		sh->reconstruct_state = reconstruct_state_result;
+	}
+
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		unsigned long ops_request)
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -742,9 +719,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
-	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
+	int prexor = 0;
 	unsigned long flags;
-	dma_async_tx_callback callback;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -752,7 +728,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
-	if (prexor) {
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		prexor = 1;
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -768,10 +745,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 		}
 	}
 
-	/* check whether this postxor is part of a write */
-	callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ?
-		ops_complete_write : ops_complete_postxor;
-
 	/* 1/ if we prexor'd then the dest is reused as a source
 	 * 2/ if we did not prexor then we are redoing the parity
 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -785,10 +758,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	if (unlikely(count == 1)) {
 		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-			flags, tx, callback, sh);
+			flags, tx, ops_complete_postxor, sh);
 	} else
 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-			flags, tx, callback, sh);
+			flags, tx, ops_complete_postxor, sh);
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -847,12 +820,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		tx = ops_run_prexor(sh, tx);
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
-		tx = ops_run_biodrain(sh, tx, ops_request);
+		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
 	}
 
 	if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
-		ops_run_postxor(sh, tx, ops_request);
+		ops_run_postxor(sh, tx);
 
 	if (test_bit(STRIPE_OP_CHECK, &ops_request))
 		ops_run_check(sh);
@@ -1669,6 +1642,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
 
 			if (dev->towrite) {
 				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantdrain, &dev->flags);
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
@@ -1681,7 +1655,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
-		sh->reconstruct_state = reconstruct_state_drain_run;
+		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
 		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
 		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
@@ -1691,15 +1665,10 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
 			if (i == pd_idx)
 				continue;
 
-			/* For a read-modify write there may be blocks that are
-			 * locked for reading while others are ready to be
-			 * written so we distinguish these blocks by the
-			 * R5_Wantprexor bit
-			 */
 			if (dev->towrite &&
 			    (test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags))) {
-				set_bit(R5_Wantprexor, &dev->flags);
+			     test_bit(R5_Wantcompute, &dev->flags))) {
+				set_bit(R5_Wantdrain, &dev->flags);
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
@@ -2660,11 +2629,11 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * completed
 	 */
 	prexor = 0;
-	if (sh->reconstruct_state == reconstruct_state_drain_result) {
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
+		prexor = 1;
+	if (sh->reconstruct_state == reconstruct_state_drain_result ||
+	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
 		sh->reconstruct_state = reconstruct_state_idle;
-		for (i = disks; i--; )
-			prexor += test_and_clear_bit(R5_Wantprexor,
-						     &sh->dev[i].flags);
 
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 5f3e674b87dd..3b2672792457 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -187,8 +187,10 @@ enum check_states {
  */
 enum reconstruct_states {
 	reconstruct_state_idle = 0,
+	reconstruct_state_prexor_drain_run,	/* prexor-write */
 	reconstruct_state_drain_run,		/* write */
 	reconstruct_state_run,			/* expand */
+	reconstruct_state_prexor_drain_result,
 	reconstruct_state_drain_result,
 	reconstruct_state_result,
 };
@@ -258,9 +260,7 @@ struct r6_state {
 #define	R5_Wantfill	12 /* dev->toread contains a bio that needs
 				    * filling
 				    */
-#define	R5_Wantprexor	13 /* distinguish blocks ready for rmw from
-				    * other "towrites"
-				    */
+#define R5_Wantdrain	13 /* dev->towrite needs to be drained */
 /*
  * Write method
  */

From 7b3a871ed995270268a481404454ceafe1a87478 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:32:09 +1000
Subject: [PATCH 25/52] md: handle operation chaining in raid5_run_ops

From: Dan Williams <dan.j.williams@intel.com>

Neil said:
> At the end of ops_run_compute5 you have:
>         /* ack now if postxor is not set to be run */
>         if (tx && !test_bit(STRIPE_OP_POSTXOR, &s->ops_run))
>                 async_tx_ack(tx);
>
> It looks odd having that test there.  Would it fit in raid5_run_ops
> better?

The intended global interpretation is that raid5_run_ops can build a chain
of xor and memcpy operations.  When MD registers the compute-xor it tells
async_tx to keep the operation handle around so that another item in the
dependency chain can be submitted. If we are just computing a block to
satisfy a read then we can terminate the chain immediately.  raid5_run_ops
gives a better context for this test since it cares about the entire chain.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c71246061c0e..456c3c2c961d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -574,8 +574,7 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	release_stripe(sh);
 }
 
-static struct dma_async_tx_descriptor *
-ops_run_compute5(struct stripe_head *sh, unsigned long ops_request)
+static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -605,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long ops_request)
 			ASYNC_TX_XOR_ZERO_DST, NULL,
 			ops_complete_compute5, sh);
 
-	/* ack now if postxor is not set to be run */
-	if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
-		async_tx_ack(tx);
-
 	return tx;
 }
 
@@ -813,8 +808,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
-		tx = ops_run_compute5(sh, ops_request);
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
+		tx = ops_run_compute5(sh);
+		/* terminate the chain if postxor is not set to be run */
+		if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+			async_tx_ack(tx);
+	}
 
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
 		tx = ops_run_prexor(sh, tx);

From 1fe797e67fb07d605b82300934d0de67068a0aca Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 09:16:30 +1000
Subject: [PATCH 26/52] md: rationalize raid5 function names

From: Dan Williams <dan.j.williams@intel.com>

Commit a4456856 refactored some of the deep code paths in raid5.c into separate
functions.  The names chosen at the time do not consistently indicate what is
going to happen to the stripe.  So, update the names, and since a stripe is a
cache element use cache semantics like fill, dirty, and clean.

(also, fix up the indentation in fetch_block5)

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c | 76 ++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 36 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 456c3c2c961d..442622067cae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1618,7 +1618,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 }
 
 static void
-handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
+schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
 {
 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
@@ -1783,7 +1783,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 }
 
 static void
-handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
+handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks,
 				struct bio **return_bi)
 {
@@ -1874,23 +1874,28 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
 			md_wakeup_thread(conf->mddev->thread);
 }
 
-/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
- * to process
+/* fetch_block5 - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill5 to continue
  */
-static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
-			struct stripe_head_state *s, int disk_idx, int disks)
+static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
+			int disk_idx, int disks)
 {
 	struct r5dev *dev = &sh->dev[disk_idx];
 	struct r5dev *failed_dev = &sh->dev[s->failed_num];
 
 	/* is the data in this block needed, and can we get it? */
 	if (!test_bit(R5_LOCKED, &dev->flags) &&
-	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
-	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-	     s->syncing || s->expanding || (s->failed &&
-	     (failed_dev->toread || (failed_dev->towrite &&
-	     !test_bit(R5_OVERWRITE, &failed_dev->flags)
-	     ))))) {
+	    !test_bit(R5_UPTODATE, &dev->flags) &&
+	    (dev->toread ||
+	     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+	     s->syncing || s->expanding ||
+	     (s->failed &&
+	      (failed_dev->toread ||
+	       (failed_dev->towrite &&
+		!test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
 		/* We would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
 		 */
@@ -1908,7 +1913,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 			 * subsequent operation.
 			 */
 			s->uptodate++;
-			return 0; /* uptodate + compute == disks */
+			return 1; /* uptodate + compute == disks */
 		} else if (test_bit(R5_Insync, &dev->flags)) {
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantread, &dev->flags);
@@ -1918,10 +1923,13 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 		}
 	}
 
-	return ~0;
+	return 0;
 }
 
-static void handle_issuing_new_read_requests5(struct stripe_head *sh,
+/**
+ * handle_stripe_fill5 - read or compute data to satisfy pending requests.
+ */
+static void handle_stripe_fill5(struct stripe_head *sh,
 			struct stripe_head_state *s, int disks)
 {
 	int i;
@@ -1931,16 +1939,14 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 	 * midst of changing due to a write
 	 */
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state) {
+	    !sh->reconstruct_state)
 		for (i = disks; i--; )
-			if (__handle_issuing_new_read_requests5(
-				sh, s, i, disks) == 0)
+			if (fetch_block5(sh, s, i, disks))
 				break;
-	}
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
-static void handle_issuing_new_read_requests6(struct stripe_head *sh,
+static void handle_stripe_fill6(struct stripe_head *sh,
 			struct stripe_head_state *s, struct r6_state *r6s,
 			int disks)
 {
@@ -1999,12 +2005,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
 }
 
 
-/* handle_completed_write_requests
+/* handle_stripe_clean_event
  * any written block on an uptodate or failed drive can be returned.
  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
  * never LOCKED, so we don't need to test 'failed' directly.
  */
-static void handle_completed_write_requests(raid5_conf_t *conf,
+static void handle_stripe_clean_event(raid5_conf_t *conf,
 	struct stripe_head *sh, int disks, struct bio **return_bi)
 {
 	int i;
@@ -2049,7 +2055,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
 			md_wakeup_thread(conf->mddev->thread);
 }
 
-static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
+static void handle_stripe_dirtying5(raid5_conf_t *conf,
 		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
 {
 	int rmw = 0, rcw = 0, i;
@@ -2136,10 +2142,10 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
-		handle_write_operations5(sh, s, rcw == 0, 0);
+		schedule_reconstruction5(sh, s, rcw == 0, 0);
 }
 
-static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
+static void handle_stripe_dirtying6(raid5_conf_t *conf,
 		struct stripe_head *sh,	struct stripe_head_state *s,
 		struct r6_state *r6s, int disks)
 {
@@ -2597,8 +2603,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * need to be failed
 	 */
 	if (s.failed > 1 && s.to_read+s.to_write+s.written)
-		handle_requests_to_failed_array(conf, sh, &s, disks,
-						&return_bi);
+		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
 	if (s.failed > 1 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2614,7 +2619,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	       !test_bit(R5_LOCKED, &dev->flags) &&
 	       test_bit(R5_UPTODATE, &dev->flags)) ||
 	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
-		handle_completed_write_requests(conf, sh, disks, &return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -2622,7 +2627,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 */
 	if (s.to_read || s.non_overwrite ||
 	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
-		handle_issuing_new_read_requests5(sh, &s, disks);
+		handle_stripe_fill5(sh, &s, disks);
 
 	/* Now we check to see if any write operations have recently
 	 * completed
@@ -2666,7 +2671,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 *    block.
 	 */
 	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_issuing_new_write_requests5(conf, sh, &s, disks);
+		handle_stripe_dirtying5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -2722,7 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
 			conf->raid_disks);
-		handle_write_operations5(sh, &s, 1, 1);
+		schedule_reconstruction5(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
@@ -2854,8 +2859,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 * might need to be failed
 	 */
 	if (s.failed > 2 && s.to_read+s.to_write+s.written)
-		handle_requests_to_failed_array(conf, sh, &s, disks,
-						&return_bi);
+		handle_failed_stripe(conf, sh, &s, disks, &return_bi);
 	if (s.failed > 2 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2880,7 +2884,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && test_bit(R5_UPTODATE, &qdev->flags)))))
-		handle_completed_write_requests(conf, sh, disks, &return_bi);
+		handle_stripe_clean_event(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -2888,11 +2892,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 */
 	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
 	    (s.syncing && (s.uptodate < disks)) || s.expanding)
-		handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
+		handle_stripe_fill6(sh, &s, &r6s, disks);
 
 	/* now to consider writing and what else, if anything should be read */
 	if (s.to_write)
-		handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
+		handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough

From b5470dc5fc18a8ff6517c3bb538d1479e58ecb02 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 27 Jun 2008 21:44:04 -0700
Subject: [PATCH 27/52] md: resolve external metadata handling deadlock in
 md_allow_write

md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete.  For externally managed metadata this causes a
deadlock as userspace needs to take the lock to communicate that the metadata
update has completed.

Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN.  The expected side effects while waiting for
userspace to write 'active' to 'array_state' are holding off reshape (code
currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to
fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and
cause updates to 'raid_disks' to fail.  Except for 'stripe_cache_size' changes
these failures can be mitigated by coordinating with mdmon.

md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action as it unconditionally waits for
MD_CHANGE_CLEAN to be cleared.

[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/md.c         | 27 ++++++++++++++++-----------
 drivers/md/raid1.c      |  6 ++++--
 drivers/md/raid5.c      | 12 +++++++++---
 include/linux/raid/md.h |  2 +-
 4 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index df1230af02cd..43d033d9a05a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	md_allow_write(mddev);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
 
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
 
@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
  */
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
 {
 	if (!mddev->pers)
-		return;
+		return 0;
 	if (mddev->ro)
-		return;
+		return 0;
 	if (!mddev->pers->sync_request)
-		return;
+		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {
@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-
 		sysfs_notify(&mddev->kobj, NULL, "array_state");
-		/* wait for the dirty state to be recorded in the metadata */
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 	} else
 		spin_unlock_irq(&mddev->write_lock);
+
+	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+		return -EAGAIN;
+	else
+		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f05d5983efb6..491dc2d4ad5f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
 	conf_t *conf = mddev_to_conf(mddev);
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_size != mddev->new_chunk ||
@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 442622067cae..8f4c70a53210 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err = 0;
+	int err;
 	struct kmem_cache *sc;
 	int i;
 
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
 
-	md_allow_write(conf->mddev);
+	err = md_allow_write(conf->mddev);
+	if (err)
+		return err;
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long new;
+	int err;
+
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
 	if (!conf)
@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b7386ae9d288..dc0e3fcb9f28 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
-extern void md_allow_write(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 
 #endif /* CONFIG_MD */

From bb57fc64b251d2696900d8a8f25ad5272d5d9c2a Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 25 Apr 2008 19:06:35 +0200
Subject: [PATCH 28/52] md: md_ioctl(): Fix misleading indentation.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 43d033d9a05a..912ed04adcff 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4973,14 +4973,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	 * here and hit the 'default' below, so only disallow
 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
 	 */
-	if (_IOC_TYPE(cmd) == MD_MAJOR &&
-	    mddev->ro && mddev->pers) {
+	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
 			sysfs_notify(&mddev->kobj, NULL, "array_state");
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			md_wakeup_thread(mddev->thread);
-
 		} else {
 			err = -EROFS;
 			goto abort_unlock;

From 2f9618ce63cb049c5587f5c650f2725c0035aa96 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 25 Apr 2008 18:57:58 +0200
Subject: [PATCH 29/52] md: md_getgeo(): Move comment to proper position.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 912ed04adcff..80bcba9951c3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4807,6 +4807,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 	return 0;
 }
 
+/*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	mddev_t *mddev = bdev->bd_disk->private_data;
@@ -4958,12 +4964,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			err = do_md_stop (mddev, 1);
 			goto done_unlock;
 
-	/*
-	 * We have a problem here : there is no easy way to give a CHS
-	 * virtual geometry. We currently pretend that we have a 2 heads
-	 * 4 sectors (with a BIG number of cylinders...). This drives
-	 * dosfs just mad... ;-)
-	 */
 	}
 
 	/*

From 13e53df354caea8986df951dcb6353c823e1f858 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Wed, 26 Mar 2008 00:07:03 +0100
Subject: [PATCH 30/52] md: do_md_run(): Fix misleading error message.

In case pers->run() succeeds but creating the bitmap fails, we
print an error message stating that pers->run() has failed.

Print this message only if pers->run() really failed.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 80bcba9951c3..b9d902652a51 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3675,7 +3675,9 @@ static int do_md_run(mddev_t * mddev)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
 	err = mddev->pers->run(mddev);
-	if (!err && mddev->pers->sync_request) {
+	if (err)
+		printk(KERN_ERR "md: pers->run() failed ...\n");
+	else if (mddev->pers->sync_request) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3684,7 +3686,6 @@ static int do_md_run(mddev_t * mddev)
 		}
 	}
 	if (err) {
-		printk(KERN_ERR "md: pers->run() failed ...\n");
 		module_put(mddev->pers->owner);
 		mddev->pers = NULL;
 		bitmap_destroy(mddev);

From 9687a60c78bbf5649d9acbde1e8818be4c8c8b94 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 25 Mar 2008 22:24:09 +0100
Subject: [PATCH 31/52] md: sync_speed_show(): Trivial cleanups.

- Remove superfluous parentheses.
- Make format string match the type of the variable that is printed.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b9d902652a51..8545818fb6d0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3168,11 +3168,11 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
-	resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
-	dt = ((jiffies - mddev->resync_mark) / HZ);
+	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
-	db = resync - (mddev->resync_mark_cnt);
-	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+	db = resync - mddev->resync_mark_cnt;
+	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
 }
 
 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);

From 910d8cb3f4ef2c4a5914176592d2f2bc3cd94cdd Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 25 Mar 2008 21:00:53 +0100
Subject: [PATCH 32/52] md: Fix typo in array_state comment.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8545818fb6d0..f5b6a0d1a67c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2624,7 +2624,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
  *     When written, doesn't tear down array, but just stops it
  * suspended (not supported yet)
  *     All IO requests will block. The array can be reconfigured.
- *     Writing this, if accepted, will block until array is quiessent
+ *     Writing this, if accepted, will block until array is quiescent
  * readonly
  *     no resync can happen.  no superblocks get written.
  *     write requests fail

From 7f6ce7692807ad60d9341e41d565a7888a5bc2dd Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Sun, 23 Mar 2008 18:34:54 +0100
Subject: [PATCH 33/52] md: Fix a typo in the comment to cmd_match().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f5b6a0d1a67c..a849e06f861c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1849,7 +1849,7 @@ repeat:
 
 }
 
-/* words written to sysfs files may, or my not, be \n terminated.
+/* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
  */
 static int cmd_match(const char *cmd, const char *str)

From 35020f1a06edade6f52fc8349e150d95cdf7fd90 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Sun, 23 Mar 2008 15:10:33 +0100
Subject: [PATCH 34/52] md: sb_equal(): Fix misleading printk.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a849e06f861c..73399d17ac3e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -564,7 +564,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 
 	if (!tmp1 || !tmp2) {
 		ret = 0;
-		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 		goto abort;
 	}
 

From 05710466c9ef2e3ee55166934c801a2393c32f80 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:20 +1000
Subject: [PATCH 35/52] md: Simplify uuid_equal().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 73399d17ac3e..8a03c953b887 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -543,17 +543,12 @@ fail:
 
 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
-	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
-		(sb1->set_uuid1 == sb2->set_uuid1) &&
-		(sb1->set_uuid2 == sb2->set_uuid2) &&
-		(sb1->set_uuid3 == sb2->set_uuid3))
-
-		return 1;
-
-	return 0;
+	return 	sb1->set_uuid0 == sb2->set_uuid0 &&
+		sb1->set_uuid1 == sb2->set_uuid1 &&
+		sb1->set_uuid2 == sb2->set_uuid2 &&
+		sb1->set_uuid3 == sb2->set_uuid3;
 }
 
-
 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
 	int ret;

From ce0c8e05f8ef93d991d665aade8c4bf35806ea1a Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:20 +1000
Subject: [PATCH 36/52] md: Simplify sb_equal().

The only caller of sb_equal() tests the return value against
zero, so it's OK to return the negated return value of memcmp().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8a03c953b887..20e03db8fdfc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -572,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 	tmp1->nr_disks = 0;
 	tmp2->nr_disks = 0;
 
-	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
-		ret = 0;
-	else
-		ret = 1;
-
+	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 abort:
 	kfree(tmp1);
 	kfree(tmp2);

From ebc243372842a81dddbe00bd047a25b8ee7d8b87 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:20 +1000
Subject: [PATCH 37/52] md: alloc_disk_sb(): Return proper error value.

If alloc_page() fails, ENOMEM is a more suitable error value
than EINVAL.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 20e03db8fdfc..9756ea0a19b9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
 	rdev->sb_page = alloc_page(GFP_KERNEL);
 	if (!rdev->sb_page) {
 		printk(KERN_ALERT "md: out of memory.\n");
-		return -EINVAL;
+		return -ENOMEM;
 	}
 
 	return 0;

From 80fab1d77b2852711917baa437e4fdab31c21fef Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:21 +1000
Subject: [PATCH 38/52] md: Simplify restart_array().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 49 +++++++++++++++++--------------------------------
 1 file changed, 17 insertions(+), 32 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9756ea0a19b9..bd1dc72ae522 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3766,40 +3766,25 @@ static int do_md_run(mddev_t * mddev)
 static int restart_array(mddev_t *mddev)
 {
 	struct gendisk *disk = mddev->gendisk;
-	int err;
 
-	/*
-	 * Complain if it has no devices
-	 */
-	err = -ENXIO;
+	/* Complain if it has no devices */
 	if (list_empty(&mddev->disks))
-		goto out;
-
-	if (mddev->pers) {
-		err = -EBUSY;
-		if (!mddev->ro)
-			goto out;
-
-		mddev->safemode = 0;
-		mddev->ro = 0;
-		set_disk_ro(disk, 0);
-
-		printk(KERN_INFO "md: %s switched to read-write mode.\n",
-			mdname(mddev));
-		/*
-		 * Kick recovery or resync if necessary
-		 */
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-		md_wakeup_thread(mddev->thread);
-		md_wakeup_thread(mddev->sync_thread);
-		err = 0;
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
-
-	} else
-		err = -EINVAL;
-
-out:
-	return err;
+		return -ENXIO;
+	if (!mddev->pers)
+		return -EINVAL;
+	if (!mddev->ro)
+		return -EBUSY;
+	mddev->safemode = 0;
+	mddev->ro = 0;
+	set_disk_ro(disk, 0);
+	printk(KERN_INFO "md: %s switched to read-write mode.\n",
+		mdname(mddev));
+	/* Kick recovery or resync if necessary */
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	md_wakeup_thread(mddev->sync_thread);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	return 0;
 }
 
 /* similar to deny_write_access, but accounts for our holding a reference

From 26ef379f53993b1da3c19b63257cd47e1d9cd672 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:21 +1000
Subject: [PATCH 39/52] md: get_disk_info(): Don't convert between signed and
 unsigned and back.

The current code copies a signed int from user space, converts it to
unsigned and passes the unsigned value to find_rdev_nr() which expects
a signed value. Simply pass the signed value from user space directly.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bd1dc72ae522..328e247e2bf5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4186,15 +4186,12 @@ out:
 static int get_disk_info(mddev_t * mddev, void __user * arg)
 {
 	mdu_disk_info_t info;
-	unsigned int nr;
 	mdk_rdev_t *rdev;
 
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;
 
-	nr = info.number;
-
-	rdev = find_rdev_nr(mddev, nr);
+	rdev = find_rdev_nr(mddev, info.number);
 	if (rdev) {
 		info.major = MAJOR(rdev->bdev->bd_dev);
 		info.minor = MINOR(rdev->bdev->bd_dev);

From df5b20cf68f9c90204c5fd36b7b090635cee3cdf Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Fri, 11 Jul 2008 22:02:22 +1000
Subject: [PATCH 40/52] md: Better control of when do_md_stop is allowed to
 stop the array.

do_md_stop check the number of active users before allowing the array
to be stopped.
Two problems:
  1/ it assumes the request is coming through an open file descriptor
     (via ioctl) so it allows for that.  This is not always the case.
  2/ it doesn't do the check it the array hasn't been activated.
     This is not good for cases when we use an inactive array to hold
     some devices in a container.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 328e247e2bf5..4c4c79da72b9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2688,7 +2688,7 @@ array_state_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_stop(mddev_t * mddev, int ro, int is_open);
 static int do_md_run(mddev_t * mddev);
 static int restart_array(mddev_t *mddev);
 
@@ -2704,14 +2704,14 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		/* stopping an active array */
 		if (atomic_read(&mddev->active) > 1)
 			return -EBUSY;
-		err = do_md_stop(mddev, 0);
+		err = do_md_stop(mddev, 0, 0);
 		break;
 	case inactive:
 		/* stopping an active array */
 		if (mddev->pers) {
 			if (atomic_read(&mddev->active) > 1)
 				return -EBUSY;
-			err = do_md_stop(mddev, 2);
+			err = do_md_stop(mddev, 2, 0);
 		} else
 			err = 0; /* already inactive */
 		break;
@@ -2719,7 +2719,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break; /* not supported yet */
 	case readonly:
 		if (mddev->pers)
-			err = do_md_stop(mddev, 1);
+			err = do_md_stop(mddev, 1, 0);
 		else {
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
@@ -2729,7 +2729,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	case read_auto:
 		if (mddev->pers) {
 			if (mddev->ro != 1)
-				err = do_md_stop(mddev, 1);
+				err = do_md_stop(mddev, 1, 0);
 			else
 				err = restart_array(mddev);
 			if (err == 0) {
@@ -3818,16 +3818,17 @@ static void restore_bitmap_write_access(struct file *file)
  *   1 - switch to readonly
  *   2 - stop but do not disassemble array
  */
-static int do_md_stop(mddev_t * mddev, int mode)
+static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 {
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
 
+	if (atomic_read(&mddev->active) > 1 + is_open) {
+		printk("md: %s still in use.\n",mdname(mddev));
+		return -EBUSY;
+	}
+
 	if (mddev->pers) {
-		if (atomic_read(&mddev->active)>2) {
-			printk("md: %s still in use.\n",mdname(mddev));
-			return -EBUSY;
-		}
 
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -3976,7 +3977,7 @@ static void autorun_array(mddev_t *mddev)
 	err = do_md_run (mddev);
 	if (err) {
 		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
-		do_md_stop (mddev, 0);
+		do_md_stop (mddev, 0, 0);
 	}
 }
 
@@ -4931,11 +4932,11 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			goto done_unlock;
 
 		case STOP_ARRAY:
-			err = do_md_stop (mddev, 0);
+			err = do_md_stop (mddev, 0, 1);
 			goto done_unlock;
 
 		case STOP_ARRAY_RO:
-			err = do_md_stop (mddev, 1);
+			err = do_md_stop (mddev, 1, 1);
 			goto done_unlock;
 
 	}
@@ -6226,7 +6227,7 @@ static int md_notify_reboot(struct notifier_block *this,
 
 		for_each_mddev(mddev, tmp)
 			if (mddev_trylock(mddev)) {
-				do_md_stop (mddev, 1);
+				do_md_stop (mddev, 1, 0);
 				mddev_unlock(mddev);
 			}
 		/*

From d71f9f88d74166dcdef743a057f9222d64d2d509 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:22 +1000
Subject: [PATCH 41/52] md: Make update_size() take the number of sectors.

Changing the internal representations of sizes of raid devices
from 1K blocks to sector counts (512B units) is desirable because
it allows to get rid of many divisions/multiplications and unnecessary
casts that are present in the current code.

This patch is a first step in this direction. It replaces the old
1K-based "size" argument of update_size() by "num_sectors" and
fixes up its two callers.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4c4c79da72b9..158c38f54a40 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2890,7 +2890,7 @@ size_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
 }
 
-static int update_size(mddev_t *mddev, unsigned long size);
+static int update_size(mddev_t *mddev, sector_t num_sectors);
 
 static ssize_t
 size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2907,7 +2907,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EINVAL;
 
 	if (mddev->pers) {
-		err = update_size(mddev, size);
+		err = update_size(mddev, size * 2);
 		md_update_sb(mddev, 1);
 	} else {
 		if (mddev->size == 0 ||
@@ -4617,24 +4617,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
-static int update_size(mddev_t *mddev, unsigned long size)
+static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
 	mdk_rdev_t * rdev;
 	int rv;
 	struct list_head *tmp;
-	int fit = (size == 0);
+	int fit = (num_sectors == 0);
 
 	if (mddev->pers->resize == NULL)
 		return -EINVAL;
-	/* The "size" is the amount of each device that is used.
-	 * This can only make sense for arrays with redundancy.
-	 * linear and raid0 always use whatever space is available
-	 * We can only consider changing the size if no resync
-	 * or reconstruction is happening, and if the new size
-	 * is acceptable. It must fit before the sb_offset or,
-	 * if that is <data_offset, it must fit before the
-	 * size of each device.
-	 * If size is zero, we find the largest size that fits.
+	/* The "num_sectors" is the number of sectors of each device that
+	 * is used.  This can only make sense for arrays with redundancy.
+	 * linear and raid0 always use whatever space is available. We can only
+	 * consider changing this number if no resync or reconstruction is
+	 * happening, and if the new size is acceptable. It must fit before the
+	 * sb_offset or, if that is <data_offset, it must fit before the size
+	 * of each device.  If num_sectors is zero, we find the largest size
+	 * that fits.
+
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
@@ -4642,12 +4642,12 @@ static int update_size(mddev_t *mddev, unsigned long size)
 		sector_t avail;
 		avail = rdev->size * 2;
 
-		if (fit && (size == 0 || size > avail/2))
-			size = avail/2;
-		if (avail < ((sector_t)size << 1))
+		if (fit && (num_sectors == 0 || num_sectors > avail))
+			num_sectors = avail;
+		if (avail < num_sectors)
 			return -ENOSPC;
 	}
-	rv = mddev->pers->resize(mddev, (sector_t)size *2);
+	rv = mddev->pers->resize(mddev, num_sectors);
 	if (!rv) {
 		struct block_device *bdev;
 
@@ -4729,7 +4729,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 			return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
 	if (info->size >= 0 && mddev->size != info->size)
-		rv = update_size(mddev, info->size);
+		rv = update_size(mddev, (sector_t)info->size * 2);
 
 	if (mddev->raid_disks    != info->raid_disks)
 		rv = update_raid_disks(mddev, info->raid_disks);

From e7debaa4951b37d6c9ace4c6b984cd4805c5bfbb Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:23 +1000
Subject: [PATCH 42/52] md: Replace calc_dev_size() by calc_num_sectors().

Number of sectors is the preferred unit for sizes of raid devices,
so change calc_dev_size() so that it returns this unit instead of
the number of 1K blocks.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 158c38f54a40..19f646a76402 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -353,15 +353,13 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 	return MD_NEW_SIZE_BLOCKS(size);
 }
 
-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
-	sector_t size;
-
-	size = rdev->sb_offset;
+	sector_t num_sectors = rdev->sb_offset * 2;
 
 	if (chunk_size)
-		size &= ~((sector_t)chunk_size/1024 - 1);
-	return size;
+		num_sectors &= ~((sector_t)chunk_size/512 - 1);
+	return num_sectors;
 }
 
 static int alloc_disk_sb(mdk_rdev_t * rdev)
@@ -753,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else 
 			ret = 0;
 	}
-	rdev->size = calc_dev_size(rdev, sb->chunk_size);
+	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
 
 	if (rdev->size < sb->size && sb->level > 1)
 		/* "this cannot possibly happen" ... */
@@ -4359,7 +4357,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 		} else 
 			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
-		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4398,7 +4396,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	int err;
-	unsigned int size;
 	mdk_rdev_t *rdev;
 
 	if (!mddev->pers)
@@ -4431,8 +4428,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 		rdev->sb_offset =
 			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 
-	size = calc_dev_size(rdev, mddev->chunk_size);
-	rdev->size = size;
+	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 

From b73df2d3d629aefa187a0a3574fd81455e026bc8 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:23 +1000
Subject: [PATCH 43/52] md: Make calc_dev_sboffset() return a sector count.

As BLOCK_SIZE_BITS is 10 and

	MD_NEW_SIZE_SECTORS(2 * x) = 2 * NEW_SIZE_BLOCKS(x),

the return value of calc_dev_sboffset() doubles. Fix up all three
callers accordingly.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 19f646a76402..3276edde7576 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -347,10 +347,11 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 	return NULL;
 }
 
+/* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
-	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
-	return MD_NEW_SIZE_BLOCKS(size);
+	sector_t num_sectors = bdev->bd_inode->i_size / 512;
+	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
@@ -673,7 +674,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	sb_offset = calc_dev_sboffset(rdev->bdev);
+	sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
 	rdev->sb_offset = sb_offset;
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
@@ -1006,7 +1007,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 	size *= 2; /* convert to sectors */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
-	rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+	rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
 	if (!size || size > rdev->sb_offset*2)
 		size = rdev->sb_offset*2;
 	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
@@ -4356,7 +4357,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
 			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 		} else 
-			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+			rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
 		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 		err = bind_rdev_to_array(rdev, mddev);
@@ -4423,7 +4424,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	}
 
 	if (mddev->persistent)
-		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+		rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
 	else
 		rdev->sb_offset =
 			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;

From 0f420358e3a2abc028320ace7783e2e38cae77bf Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:23 +1000
Subject: [PATCH 44/52] md: Turn rdev->sb_offset into a sector-based quantity.

Rename it to sb_start to make sure all users have been converted.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/bitmap.c       | 10 ++---
 drivers/md/md.c           | 81 ++++++++++++++++++---------------------
 include/linux/raid/md_k.h |  2 +-
 3 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index dedba16d42f7..eba83e25b678 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 
-		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
 			page->index = index;
@@ -262,12 +262,12 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 					/* bitmap runs in to metadata */
 					return -EINVAL;
 				if (rdev->data_offset + mddev->size*2
-				    > rdev->sb_offset*2 + bitmap->offset)
+				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
 					return -EINVAL;
-			} else if (rdev->sb_offset*2 < rdev->data_offset) {
+			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
-				if (rdev->sb_offset*2
+				if (rdev->sb_start
 				    + bitmap->offset
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
@@ -277,7 +277,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				/* DATA METADATA BITMAP - no problems */
 			}
 			md_super_write(mddev, rdev,
-				       (rdev->sb_offset<<1) + bitmap->offset
+				       rdev->sb_start + bitmap->offset
 				       + page->index * (PAGE_SIZE/512),
 				       size,
 				       page);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3276edde7576..5590cb54b584 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -356,7 +356,7 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 
 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
-	sector_t num_sectors = rdev->sb_offset * 2;
+	sector_t num_sectors = rdev->sb_start;
 
 	if (chunk_size)
 		num_sectors &= ~((sector_t)chunk_size/512 - 1);
@@ -383,7 +383,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		put_page(rdev->sb_page);
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
-		rdev->sb_offset = 0;
+		rdev->sb_start = 0;
 		rdev->size = 0;
 	}
 }
@@ -529,7 +529,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 		return 0;
 
 
-	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
 		goto fail;
 	rdev->sb_loaded = 1;
 	return 0;
@@ -666,16 +666,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	mdp_super_t *sb;
 	int ret;
-	sector_t sb_offset;
 
 	/*
-	 * Calculate the position of the superblock,
+	 * Calculate the position of the superblock (512byte sectors),
 	 * it's at the end of the disk.
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
 	if (ret) return ret;
@@ -1007,10 +1005,10 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 	size *= 2; /* convert to sectors */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
-	rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
-	if (!size || size > rdev->sb_offset*2)
-		size = rdev->sb_offset*2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	if (!size || size > rdev->sb_start)
+		size = rdev->sb_start;
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
 	return size/2; /* kB for sysfs */
@@ -1048,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 {
 	struct mdp_superblock_1 *sb;
 	int ret;
-	sector_t sb_offset;
+	sector_t sb_start;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	int bmask;
 
 	/*
-	 * Calculate the position of the superblock.
+	 * Calculate the position of the superblock in 512byte sectors.
 	 * It is always aligned to a 4K boundary and
 	 * depeding on minor_version, it can be:
 	 * 0: At least 8K, but less than 12K, from end of device
@@ -1062,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
-		sb_offset -= 8*2;
-		sb_offset &= ~(sector_t)(4*2-1);
-		/* convert from sectors to K */
-		sb_offset /= 2;
+		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start -= 8*2;
+		sb_start &= ~(sector_t)(4*2-1);
 		break;
 	case 1:
-		sb_offset = 0;
+		sb_start = 0;
 		break;
 	case 2:
-		sb_offset = 4;
+		sb_start = 8;
 		break;
 	default:
 		return -EINVAL;
 	}
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = sb_start;
 
 	/* superblock is rarely larger than 1K, but it can be larger,
 	 * and it is safe to read 4k, so we do that
@@ -1091,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
 	    sb->major_version != cpu_to_le32(1) ||
 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
-	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
 		return -EINVAL;
 
@@ -1127,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
 	if (minor_version
-	    && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
 		return -EINVAL;
 
 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1163,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (minor_version)
 		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
 	else
-		rdev->size = rdev->sb_offset;
+		rdev->size = rdev->sb_start / 2;
 	if (rdev->size < le64_to_cpu(sb->data_size)/2)
 		return -EINVAL;
 	rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1350,7 +1346,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 	if (size && size < rdev->mddev->size)
 		return 0; /* component must fit device */
 	size *= 2; /* convert to sectors */
-	if (rdev->sb_offset < rdev->data_offset/2) {
+	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
 		max_size = (rdev->bdev->bd_inode->i_size >> 9);
 		max_size -= rdev->data_offset;
@@ -1361,19 +1357,19 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 		return 0;
 	} else {
 		/* minor version 0; superblock after data */
-		sector_t sb_offset;
-		sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
-		sb_offset &= ~(sector_t)(4*2 - 1);
-		max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2;
+		sector_t sb_start;
+		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start &= ~(sector_t)(4*2 - 1);
+		max_size = rdev->size*2 + sb_start - rdev->sb_start;
 		if (!size || size > max_size)
 			size = max_size;
-		rdev->sb_offset = sb_offset/2;
+		rdev->sb_start = sb_start;
 	}
 	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
 	sb->data_size = cpu_to_le64(size);
-	sb->super_offset = rdev->sb_offset*2;
+	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
 	return size/2; /* kB for sysfs */
@@ -1810,11 +1806,11 @@ repeat:
 		dprintk("%s ", bdevname(rdev->bdev,b));
 		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
-				       rdev->sb_offset<<1, rdev->sb_size,
+				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
 				bdevname(rdev->bdev,b),
-				(unsigned long long)rdev->sb_offset);
+				(unsigned long long)rdev->sb_start);
 			rdev->sb_events = mddev->events;
 
 		} else
@@ -3577,16 +3573,16 @@ static int do_md_run(mddev_t * mddev)
 		 * We don't want the data to overlap the metadata,
 		 * Internal Bitmap issues has handled elsewhere.
 		 */
-		if (rdev->data_offset < rdev->sb_offset) {
+		if (rdev->data_offset < rdev->sb_start) {
 			if (mddev->size &&
 			    rdev->data_offset + mddev->size*2
-			    > rdev->sb_offset*2) {
+			    > rdev->sb_start) {
 				printk("md: %s: data overlaps metadata\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
 		} else {
-			if (rdev->sb_offset*2 + rdev->sb_size/512
+			if (rdev->sb_start + rdev->sb_size/512
 			    > rdev->data_offset) {
 				printk("md: %s: metadata overlaps data\n",
 				       mdname(mddev));
@@ -4355,9 +4351,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
 		if (!mddev->persistent) {
 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
-			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else 
-			rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
+			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 		err = bind_rdev_to_array(rdev, mddev);
@@ -4424,10 +4420,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	}
 
 	if (mddev->persistent)
-		rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
+		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	else
-		rdev->sb_offset =
-			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
 	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
@@ -4628,7 +4623,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 	 * linear and raid0 always use whatever space is available. We can only
 	 * consider changing this number if no resync or reconstruction is
 	 * happening, and if the new size is acceptable. It must fit before the
-	 * sb_offset or, if that is <data_offset, it must fit before the size
+	 * sb_start or, if that is <data_offset, it must fit before the size
 	 * of each device.  If num_sectors is zero, we find the largest size
 	 * that fits.
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index df30c4395875..e37aaa41efc6 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -59,7 +59,7 @@ struct mdk_rdev_s
 	int		sb_loaded;
 	__u64		sb_events;
 	sector_t	data_offset;	/* start of data in array */
-	sector_t	sb_offset;
+	sector_t 	sb_start;	/* offset of the super block (in 512byte sectors) */
 	int		sb_size;	/* bytes in the superblock */
 	int		preferred_minor;	/* autorun support */
 

From 7e93a89251d4ed7bd4475db62616ccd03ddfd01a Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:23 +1000
Subject: [PATCH 45/52] md: Remove some unused macros.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 include/linux/raid/md_p.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 3f2cd98c508b..8b4de4a41ff1 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -43,14 +43,11 @@
  */
 #define MD_RESERVED_BYTES		(64 * 1024)
 #define MD_RESERVED_SECTORS		(MD_RESERVED_BYTES / 512)
-#define MD_RESERVED_BLOCKS		(MD_RESERVED_BYTES / BLOCK_SIZE)
 
 #define MD_NEW_SIZE_SECTORS(x)		((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
-#define MD_NEW_SIZE_BLOCKS(x)		((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
 
 #define MD_SB_BYTES			4096
 #define MD_SB_WORDS			(MD_SB_BYTES / 4)
-#define MD_SB_BLOCKS			(MD_SB_BYTES / BLOCK_SIZE)
 #define MD_SB_SECTORS			(MD_SB_BYTES / 512)
 
 /*

From d7027458d68b2f1752a28016dcf2ffd0a7e8f567 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Sat, 12 Jul 2008 10:37:50 +1000
Subject: [PATCH 46/52] md: Tidy up rdev_size_store a bit:

 - used strict_strtoull in place of simple_strtoull
 - use my_mddev in place of rdev->mddev (they have the same value)
and more significantly,
 - don't adjust mddev->size to fit, rather reject changes which make
   rdev->size smaller than mddev->size

Adjusting mddev->size is a hangover from bind_rdev_to_array which
does a similar thing.  But it really is a better design to insist that
mddev->size is set as required, then the rdev->sizes are set to allow
for that.  The previous way invites confusion.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 Documentation/md.txt |  2 +-
 drivers/md/md.c      | 17 ++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index e06cc59437e4..1da9d1b1793f 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -347,7 +347,7 @@ Each directory contains:
         for storage of data.  This will normally be the same as the
 	component_size.  This can be written while assembling an
         array.  If a value less than the current component_size is
-        written, component_size will be reduced to this value.
+        written, it will be rejected.
 
 
 An active md device will also contain and entry for each active device
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5590cb54b584..95466bb089ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2101,16 +2101,17 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
+	unsigned long long size;
 	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
 
-	if (e==buf || (*e && *e != '\n'))
+	if (strict_strtoull(buf, 10, &size) < 0)
+		return -EINVAL;
+	if (size < my_mddev->size)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
-		if (rdev->mddev->persistent) {
-			size = super_types[rdev->mddev->major_version].
+		if (my_mddev->persistent) {
+			size = super_types[my_mddev->major_version].
 				rdev_size_change(rdev, size);
 			if (!size)
 				return -EBUSY;
@@ -2118,12 +2119,12 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			size = (rdev->bdev->bd_inode->i_size >> 10);
 			size -= rdev->data_offset/2;
 		}
-		if (size < rdev->mddev->size)
+		if (size < my_mddev->size)
 			return -EINVAL; /* component must fit device */
 	}
 
 	rdev->size = size;
-	if (size > oldsize && rdev->mddev->external) {
+	if (size > oldsize && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
 		 * a deadlock.  We have already changed rdev->size, and if
@@ -2165,8 +2166,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			return -EBUSY;
 		}
 	}
-	if (size < my_mddev->size || my_mddev->size == 0)
-		my_mddev->size = size;
 	return len;
 }
 

From d07bd3bcc456228b56a790897162a634691fed9b Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 21 Jul 2008 14:42:07 +1000
Subject: [PATCH 47/52] md: Fix check for overlapping devices.

The checks in overlaps() expect all parameters either in block-based
or sector-based quantities. However, its single caller passes two
rdev->data_offset arguments as well as two rdev->size arguments, the
former being sector counts while the latter are measured in 1K blocks.

This could cause rdev_size_store() to accept an invalid size from user
space. Fix it by passing only sector-based quantities to overlaps().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 95466bb089ab..a2813fa06b7c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2143,8 +2143,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 				if (test_bit(AllReserved, &rdev2->flags) ||
 				    (rdev->bdev == rdev2->bdev &&
 				     rdev != rdev2 &&
-				     overlaps(rdev->data_offset, rdev->size,
-					    rdev2->data_offset, rdev2->size))) {
+				     overlaps(rdev->data_offset, rdev->size * 2,
+					      rdev2->data_offset,
+					      rdev2->size * 2))) {
 					overlap = 1;
 					break;
 				}

From 15f4a5fdf3aa07b53f6a7969664741db5882e485 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 21 Jul 2008 14:42:12 +1000
Subject: [PATCH 48/52] md: Make super_type->rdev_size_change() take
 sector-based sizes.

Also, change the type of the size parameter from unsigned long long to
sector_t and rename it to num_sectors.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a2813fa06b7c..df13a17a9627 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -655,7 +655,7 @@ struct super_type  {
 	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
 	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
-						unsigned long long size);
+						sector_t num_sectors);
 };
 
 /*
@@ -998,20 +998,19 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
  * rdev_size_change for 0.90.0
  */
 static unsigned long long
-super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
+super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-	if (size && size < rdev->mddev->size)
+	if (num_sectors && num_sectors < rdev->mddev->size * 2)
 		return 0; /* component must fit device */
-	size *= 2; /* convert to sectors */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
 	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-	if (!size || size > rdev->sb_start)
-		size = rdev->sb_start;
+	if (!num_sectors || num_sectors > rdev->sb_start)
+		num_sectors = rdev->sb_start;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return size/2; /* kB for sysfs */
+	return num_sectors / 2; /* kB for sysfs */
 }
 
 
@@ -1339,19 +1338,18 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 }
 
 static unsigned long long
-super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
+super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
-	unsigned long long max_size;
-	if (size && size < rdev->mddev->size)
+	sector_t max_sectors;
+	if (num_sectors && num_sectors < rdev->mddev->size * 2)
 		return 0; /* component must fit device */
-	size *= 2; /* convert to sectors */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
-		max_size = (rdev->bdev->bd_inode->i_size >> 9);
-		max_size -= rdev->data_offset;
-		if (!size || size > max_size)
-			size = max_size;
+		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+		max_sectors -= rdev->data_offset;
+		if (!num_sectors || num_sectors > max_sectors)
+			num_sectors = max_sectors;
 	} else if (rdev->mddev->bitmap_offset) {
 		/* minor version 0 with bitmap we can't move */
 		return 0;
@@ -1360,19 +1358,19 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 		sector_t sb_start;
 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_size = rdev->size*2 + sb_start - rdev->sb_start;
-		if (!size || size > max_size)
-			size = max_size;
+		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		if (!num_sectors || num_sectors > max_sectors)
+			num_sectors = max_sectors;
 		rdev->sb_start = sb_start;
 	}
 	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
-	sb->data_size = cpu_to_le64(size);
+	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return size/2; /* kB for sysfs */
+	return num_sectors / 2; /* kB for sysfs */
 }
 
 static struct super_type super_types[] = {
@@ -2112,7 +2110,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
 			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size);
+				rdev_size_change(rdev, size * 2);
 			if (!size)
 				return -EBUSY;
 		} else if (!size) {

From f233ea5c9e0d8b95e4283bf6a3436b88f6fd3586 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 21 Jul 2008 17:05:22 +1000
Subject: [PATCH 49/52] md: Make mddev->array_size sector-based.

This patch renames the array_size field of struct mddev_s to array_sectors
and converts all instances to use units of 512 byte sectors instead of 1k
blocks.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/faulty.c       |  2 +-
 drivers/md/linear.c       |  6 +++---
 drivers/md/md.c           | 12 +++++++-----
 drivers/md/multipath.c    |  2 +-
 drivers/md/raid0.c        |  8 ++++----
 drivers/md/raid1.c        | 11 ++++++-----
 drivers/md/raid10.c       |  2 +-
 drivers/md/raid5.c        | 16 +++++++++-------
 include/linux/raid/md_k.h |  2 +-
 9 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index d107ddceefcd..268547dbfbd3 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
 	rdev_for_each(rdev, tmp, mddev)
 		conf->rdev = rdev;
 
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ec921f58fbb8..57644a780f16 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -256,7 +256,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_size = conf->array_size;
+	mddev->array_sectors = conf->array_size * 2;
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -290,8 +290,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_size = newconf->array_size;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = newconf->array_size * 2;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index df13a17a9627..4bfbc1982cda 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3704,7 +3704,7 @@ static int do_md_run(mddev_t * mddev)
 	if (mddev->flags)
 		md_update_sb(mddev, 0);
 
-	set_capacity(disk, mddev->array_size<<1);
+	set_capacity(disk, mddev->array_sectors);
 
 	/* If we call blk_queue_make_request here, it will
 	 * re-initialise max_sectors etc which may have been
@@ -3905,7 +3905,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 
 		export_array(mddev);
 
-		mddev->array_size = 0;
+		mddev->array_sectors = 0;
 		mddev->size = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
@@ -4644,7 +4644,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 		bdev = bdget_disk(mddev->gendisk, 0);
 		if (bdev) {
 			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
 			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
@@ -5391,10 +5392,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (!list_empty(&mddev->disks)) {
 			if (mddev->pers)
 				seq_printf(seq, "\n      %llu blocks",
-					(unsigned long long)mddev->array_size);
+					   (unsigned long long)
+					   mddev->array_sectors / 2);
 			else
 				seq_printf(seq, "\n      %llu blocks",
-					(unsigned long long)size);
+					   (unsigned long long)size);
 		}
 		if (mddev->persistent) {
 			if (mddev->major_version != 0 ||
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 541cbe3414bd..c4779ccba1c3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -504,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 
 	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 914c04ddec7c..2f30ebd8b7ab 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -293,16 +293,16 @@ static int raid0_run (mddev_t *mddev)
 		goto out_free_conf;
 
 	/* calculate array device size */
-	mddev->array_size = 0;
+	mddev->array_sectors = 0;
 	rdev_for_each(rdev, tmp, mddev)
-		mddev->array_size += rdev->size;
+		mddev->array_sectors += rdev->size * 2;
 
 	printk("raid0 : md_size is %llu blocks.\n", 
-		(unsigned long long)mddev->array_size);
+		(unsigned long long)mddev->array_sectors / 2);
 	printk("raid0 : conf->hash_spacing is %llu blocks.\n",
 		(unsigned long long)conf->hash_spacing);
 	{
-		sector_t s = mddev->array_size;
+		sector_t s = mddev->array_sectors / 2;
 		sector_t space = conf->hash_spacing;
 		int round;
 		conf->preshift = 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 491dc2d4ad5f..03a5ab705c20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2043,7 +2043,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 
 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2105,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	mddev->array_size = sectors>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
+	if (mddev->array_sectors / 2 > mddev->size &&
+	    mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = mddev->array_size;
+	mddev->size = mddev->array_sectors / 2;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index df08a9fa3a1f..2acea4025243 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2164,7 +2164,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = size << (conf->chunk_shift-1);
+	mddev->array_sectors = size << conf->chunk_shift;
 	mddev->resync_max_sectors = size << conf->chunk_shift;
 
 	mddev->queue->unplug_fn = raid10_unplug;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8f4c70a53210..42a480ba767b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3540,7 +3540,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			    j == raid6_next_disk(sh->pd_idx, sh->disks))
 				continue;
 			s = compute_blocknr(sh, j);
-			if (s < (mddev->array_size<<1)) {
+			if (s < mddev->array_sectors) {
 				skipped = 1;
 				continue;
 			}
@@ -4189,7 +4189,7 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-	mddev->array_size =  mddev->size * (conf->previous_raid_disks -
+	mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
 					    conf->max_degraded);
 
 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4413,8 +4413,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = sectors * (mddev->raid_disks
+					  - conf->max_degraded);
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
 	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
@@ -4547,15 +4548,16 @@ static void end_reshape(raid5_conf_t *conf)
 	struct block_device *bdev;
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-		conf->mddev->array_size = conf->mddev->size *
+		conf->mddev->array_sectors = 2 * conf->mddev->size *
 			(conf->raid_disks - conf->max_degraded);
-		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+		set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
 		conf->mddev->changed = 1;
 
 		bdev = bdget_disk(conf->mddev->gendisk, 0);
 		if (bdev) {
 			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)conf->mddev->array_sectors << 9);
 			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index e37aaa41efc6..6f72b47ae41c 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -150,7 +150,7 @@ struct mddev_s
 	int				raid_disks;
 	int				max_disks;
 	sector_t			size; /* used size of component devices */
-	sector_t			array_size; /* exported array size */
+	sector_t			array_sectors; /* exported array size */
 	__u64				events;
 
 	char				uuid[16];

From d6e2215052810678bc9782fd980b52706fc71f50 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 21 Jul 2008 17:05:25 +1000
Subject: [PATCH 50/52] md: linear: Make array_size sector-based and rename it
 to array_sectors.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c         | 16 ++++++++--------
 include/linux/raid/linear.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 57644a780f16..1cafaa959443 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -120,7 +120,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		return NULL;
 
 	cnt = 0;
-	conf->array_size = 0;
+	conf->array_sectors = 0;
 
 	rdev_for_each(rdev, tmp, mddev) {
 		int j = rdev->raid_disk;
@@ -144,7 +144,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->size = rdev->size;
-		conf->array_size += rdev->size;
+		conf->array_sectors += rdev->size * 2;
 
 		cnt++;
 	}
@@ -153,7 +153,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		goto out;
 	}
 
-	min_spacing = conf->array_size;
+	min_spacing = conf->array_sectors / 2;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
 
 	/* min_spacing is the minimum spacing that will fit the hash
@@ -162,7 +162,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	 * that is larger than min_spacing as use the size of that as
 	 * the actual spacing
 	 */
-	conf->hash_spacing = conf->array_size;
+	conf->hash_spacing = conf->array_sectors / 2;
 	for (i=0; i < cnt-1 ; i++) {
 		sector_t sz = 0;
 		int j;
@@ -192,7 +192,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		unsigned round;
 		unsigned long base;
 
-		sz = conf->array_size >> conf->preshift;
+		sz = conf->array_sectors >> (conf->preshift + 1);
 		sz += 1; /* force round-up */
 		base = conf->hash_spacing >> conf->preshift;
 		round = sector_div(sz, base);
@@ -219,7 +219,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	curr_offset = 0;
 	i = 0;
 	for (curr_offset = 0;
-	     curr_offset < conf->array_size;
+	     curr_offset < conf->array_sectors / 2;
 	     curr_offset += conf->hash_spacing) {
 
 		while (i < raid_disks-1 &&
@@ -256,7 +256,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_sectors = conf->array_size * 2;
+	mddev->array_sectors = conf->array_sectors;
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -290,7 +290,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_sectors = newconf->array_size * 2;
+	mddev->array_sectors = newconf->array_sectors;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index ba15469daf11..7e375111d007 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -16,7 +16,7 @@ struct linear_private_data
 	struct linear_private_data *prev;	/* earlier version */
 	dev_info_t		**hash_table;
 	sector_t		hash_spacing;
-	sector_t		array_size;
+	sector_t		array_sectors;
 	int			preshift; /* shift before dividing by hash_spacing */
 	dev_info_t		disks[0];
 };

From f2ea68cf42aafdd93393b6b8b20fc3c2b5f4390c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 Jul 2008 17:05:25 +1000
Subject: [PATCH 51/52] md: only count actual openers as access which prevent a
 'stop'

Open isn't the only thing that increments ->active.  e.g. reading
/proc/mdstat will increment it briefly.  So to avoid false positives
in testing for concurrent access, introduce a new counter that counts
just the number of times the md device it open.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c           | 9 ++++++---
 include/linux/raid/md_k.h | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4bfbc1982cda..450f30b6edf9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -273,6 +273,7 @@ static mddev_t * mddev_find(dev_t unit)
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
+	atomic_set(&new->openers, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
@@ -2695,14 +2696,14 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break;
 	case clear:
 		/* stopping an active array */
-		if (atomic_read(&mddev->active) > 1)
+		if (atomic_read(&mddev->openers) > 0)
 			return -EBUSY;
 		err = do_md_stop(mddev, 0, 0);
 		break;
 	case inactive:
 		/* stopping an active array */
 		if (mddev->pers) {
-			if (atomic_read(&mddev->active) > 1)
+			if (atomic_read(&mddev->openers) > 0)
 				return -EBUSY;
 			err = do_md_stop(mddev, 2, 0);
 		} else
@@ -3816,7 +3817,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
 
-	if (atomic_read(&mddev->active) > 1 + is_open) {
+	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		return -EBUSY;
 	}
@@ -5014,6 +5015,7 @@ static int md_open(struct inode *inode, struct file *file)
 
 	err = 0;
 	mddev_get(mddev);
+	atomic_inc(&mddev->openers);
 	mddev_unlock(mddev);
 
 	check_disk_change(inode->i_bdev);
@@ -5026,6 +5028,7 @@ static int md_release(struct inode *inode, struct file * file)
  	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
 
 	BUG_ON(!mddev);
+	atomic_dec(&mddev->openers);
 	mddev_put(mddev);
 
 	return 0;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 6f72b47ae41c..4bef4791d80d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -215,7 +215,8 @@ struct mddev_s
 
 	int				in_sync;	/* know to not need resync */
 	struct mutex			reconfig_mutex;
-	atomic_t			active;
+	atomic_t			active;		/* general refcount */
+	atomic_t			openers;	/* number of active opens */
 
 	int				changed;	/* true if we might need to reread partition info */
 	int				degraded;	/* whether md should consider

From 4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 Jul 2008 17:05:25 +1000
Subject: [PATCH 52/52] md: Protect access to mddev->disks list using RCU

All modifications and most access to the mddev->disks list are made
under the reconfig_mutex lock.  However there are three places where
the list is walked without any locking.  If a reconfig happens at this
time, havoc (and oops) can ensue.

So use RCU to protect these accesses:
  - wrap them in rcu_read_{,un}lock()
  - use list_for_each_entry_rcu
  - add to the list with list_add_rcu
  - delete from the list with list_del_rcu
  - delay the 'free' with call_rcu rather than schedule_work

Note that export_rdev did a list_del_init on this list.  In almost all
cases the entry was not in the list anymore so it was a no-op and so
safe.  It is no longer safe as after list_del_rcu we may not touch
the list_head.
An audit shows that export_rdev is called:
  - after unbind_rdev_from_array, in which case the delete has
     already been done,
  - after bind_rdev_to_array fails, in which case the delete isn't needed.
  - before the device has been put on a list at all (e.g. in
      add_new_disk where reading the superblock fails).
  - and in autorun devices after a failure when the device is on a
      different list.

So remove the list_del_init call from export_rdev, and add it back
immediately before the called to export_rdev for that last case.

Note also that ->same_set is sometimes used for lists other than
mddev->list (e.g. candidates).  In these cases rcu is not needed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c       | 15 ++++++++++-----
 drivers/md/md.c           | 30 ++++++++++++++++++------------
 include/linux/raid/md_k.h |  3 +++
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index eba83e25b678..621a272a2c74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
 	mddev_t *mddev = bitmap->mddev;
 
-	rdev_for_each(rdev, tmp, mddev)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev)
 		if (test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags)) {
 			int size = PAGE_SIZE;
@@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + (long)(page->index * (PAGE_SIZE/512))
 				    + size/512 > 0)
 					/* bitmap runs in to metadata */
-					return -EINVAL;
+					goto bad_alignment;
 				if (rdev->data_offset + mddev->size*2
 				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
-					return -EINVAL;
+					goto bad_alignment;
 			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
 				if (rdev->sb_start
@@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
 					/* bitmap runs in to data */
-					return -EINVAL;
+					goto bad_alignment;
 			} else {
 				/* DATA METADATA BITMAP - no problems */
 			}
@@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				       size,
 				       page);
 		}
+	rcu_read_unlock();
 
 	if (wait)
 		md_super_wait(mddev);
 	return 0;
+
+ bad_alignment:
+	rcu_read_unlock();
+	return -EINVAL;
 }
 
 static void bitmap_file_kick(struct bitmap *bitmap);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 450f30b6edf9..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1395,15 +1395,17 @@ static struct super_type super_types[] = {
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
-	struct list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev, *rdev2;
 
-	rdev_for_each(rdev, tmp, mddev1)
-		rdev_for_each(rdev2, tmp2, mddev2)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev1)
+		rdev_for_each_rcu(rdev2, mddev2)
 			if (rdev->bdev->bd_contains ==
-			    rdev2->bdev->bd_contains)
+			    rdev2->bdev->bd_contains) {
+				rcu_read_unlock();
 				return 1;
-
+			}
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		kobject_del(&rdev->kobj);
 		goto fail;
 	}
-	list_add(&rdev->same_set, &mddev->disks);
+	list_add_rcu(&rdev->same_set, &mddev->disks);
 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
 	return 0;
 
@@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		return;
 	}
 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
-	list_del_init(&rdev->same_set);
+	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 
 	/* We need to delay this, otherwise we can deadlock when
-	 * writing to 'remove' to "dev/state"
+	 * writing to 'remove' to "dev/state".  We also need
+	 * to delay it due to rcu usage.
 	 */
+	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
 	schedule_work(&rdev->del_work);
@@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
 	if (rdev->mddev)
 		MD_BUG();
 	free_disk_sb(rdev);
-	list_del_init(&rdev->same_set);
 #ifndef MODULE
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
@@ -4062,8 +4065,10 @@ static void autorun_devices(int part)
 		/* on success, candidates will be empty, on error
 		 * it won't...
 		 */
-		rdev_for_each_list(rdev, tmp, candidates)
+		rdev_for_each_list(rdev, tmp, candidates) {
+			list_del_init(&rdev->same_set);
 			export_rdev(rdev);
+		}
 		mddev_put(mddev);
 	}
 	printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
 static int is_mddev_idle(mddev_t *mddev)
 {
 	mdk_rdev_t * rdev;
-	struct list_head *tmp;
 	int idle;
 	long curr_events;
 
 	idle = 1;
-	rdev_for_each(rdev, tmp, mddev) {
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
 		curr_events = disk_stat_read(disk, sectors[0]) + 
 				disk_stat_read(disk, sectors[1]) - 
@@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
 			idle = 0;
 		}
 	}
+	rcu_read_unlock();
 	return idle;
 }
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 4bef4791d80d..9f2549ac0e2d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
 #define rdev_for_each(rdev, tmp, mddev)				\
 	rdev_for_each_list(rdev, tmp, (mddev)->disks)
 
+#define rdev_for_each_rcu(rdev, mddev)				\
+	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
 typedef struct mdk_thread_s {
 	void			(*run) (mddev_t *mddev);
 	mddev_t			*mddev;