From b74fd2826c5acce20e6f691437b2d19372bc2057 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:47:19 +1000 Subject: [PATCH 1/7] md: fix loading of out-of-date bitmap. When md is loading a bitmap which it knows is out of date, it fills each page with 1s and writes it back out again. However the write_page call makes used of bitmap->file_pages and bitmap->last_page_size which haven't been set correctly yet. So this can sometimes fail. Move the setting of file_pages and last_page_size to before the call to write_page. This bug can cause the assembly on an array to fail, thus making the data inaccessible. Hence I think it is a suitable candidate for -stable. Cc: stable@kernel.org Reported-by: Vojtech Pavlik Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1fb91edc7de2..bc1d64b7b63b 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -986,6 +986,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) oldindex = index; oldpage = page; + bitmap->filemap[bitmap->file_pages++] = page; + bitmap->last_page_size = count; + if (outofdate) { /* * if bitmap is out of date, dirty the @@ -998,15 +1001,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) write_page(bitmap, page, 1); ret = -EIO; - if (bitmap->flags & BITMAP_WRITE_ERROR) { - /* release, page not in filemap yet */ - put_page(page); + if (bitmap->flags & BITMAP_WRITE_ERROR) goto err; - } } - - bitmap->filemap[bitmap->file_pages++] = page; - bitmap->last_page_size = count; } paddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) From 18055569127253755d01733f6ecc004ed02f88d0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:48:10 +1000 Subject: [PATCH 2/7] md/raid10: don't clear bitmap during recovery if array will still be degraded. If we have a raid10 with multiple missing devices, and we recover just one of these to a spare, then we risk (depending on the bitmap and array chunk size) clearing bits of the bitmap for which recovery isn't complete (because a device is still missing). This can lead to a subsequent "re-add" being recovered without any IO happening, which would result in loss of data. This patch takes the safe approach of not clearing bitmap bits if the array will still be degraded. This patch is suitable for all active -stable kernels. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 81a54f17417e..499620afb44b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1809,17 +1809,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio->sector = sect; raid10_find_phys(conf, r10_bio); - /* Need to check if this section will still be + + /* Need to check if the array will still be * degraded */ - for (j=0; jcopies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) { + for (j=0; jraid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { still_degraded = 1; break; } - } + must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); From db305e507d554430a69ede901a6308e6ecb72349 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:49:06 +1000 Subject: [PATCH 3/7] md: fix some (more) errors with bitmaps on devices larger than 2TB. If a write intent bitmap covers more than 2TB, we sometimes work with values beyond 32bit, so these need to be sector_t. This patches add the required casts to some unsigned longs that are being shifted up. This will affect any raid10 larger than 2TB, or any raid1/4/5/6 with member devices that are larger than 2TB. Signed-off-by: NeilBrown Reported-by: "Mario 'BitKoenig' Holbe" Cc: stable@kernel.org --- drivers/md/bitmap.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index bc1d64b7b63b..47c68bc75a17 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1013,9 +1013,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) kunmap_atomic(paddr, KM_USER0); if (b) { /* if the disk bit is set, set the memory bit */ - bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap), - ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start) - ); + int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + needed); bit_cnt++; set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } @@ -1151,8 +1153,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) spin_lock_irqsave(&bitmap->lock, flags); clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } - bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), - &blocks, 0); + bmc = bitmap_get_counter(bitmap, + (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), + &blocks, 0); if (bmc) { /* if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); @@ -1166,7 +1169,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) } else if (*bmc == 1) { /* we can clear the bit */ *bmc = 0; - bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), + bitmap_count_page(bitmap, + (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), -1); /* clear the bit */ @@ -1511,7 +1515,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { - sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap); + sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); bitmap_set_memory_bits(bitmap, sec, 1); bitmap_file_set_bit(bitmap, sec); } From dd71cf6b2773310b01c6fe6c773064c80fd2476b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:49:35 +1000 Subject: [PATCH 4/7] md: tidy up status_resync to handle large arrays. Two problems in status_resync. 1/ It still used Kilobytes as the basic block unit, while most code now uses sectors uniformly. 2/ It doesn't allow for the possibility that max_sectors exceeds the range of "unsigned long". So - change "max_blocks" to "max_sectors", and store sector numbers in there and in 'resync' - Make 'rt' a 'sector_t' so it can temporarily hold the number of remaining sectors. - use sector_div rather than normal division. - change the magic '100' used to preserve precision to '32'. + making it a power of 2 makes division easier + it doesn't need to be as large as it was chosen when we averaged speed over the entire run. Now we average speed over the last 30 seconds or so. Reported-by: "Mario 'BitKoenig' Holbe" Signed-off-by: NeilBrown --- drivers/md/md.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 612343fdde94..5eb01a4d27ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5705,37 +5705,38 @@ static void status_unused(struct seq_file *seq) static void status_resync(struct seq_file *seq, mddev_t * mddev) { - sector_t max_blocks, resync, res; - unsigned long dt, db, rt; + sector_t max_sectors, resync, res; + unsigned long dt, db; + sector_t rt; int scale; unsigned int per_milli; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors >> 1; + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->dev_sectors / 2; + max_sectors = mddev->dev_sectors; /* * Should not happen. */ - if (!max_blocks) { + if (!max_sectors) { MD_BUG(); return; } /* Pick 'scale' such that (resync>>scale)*1000 will fit - * in a sector_t, and (max_blocks>>scale) will fit in a + * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { - while ( max_blocks/2 > (1ULL<<(scale+32))) + while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; - sector_div(res, (u32)((max_blocks>>scale)+1)); + sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { @@ -5756,25 +5757,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))), per_milli/10, per_milli % 10, - (unsigned long long) resync, - (unsigned long long) max_blocks); + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); /* - * We do not want to overflow, so the order of operands and - * the * 100 / 100 trick are important. We do a +1 to be - * safe against division by zero. We only estimate anyway. - * * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time + * + * rt is a sector_t, so could be 32bit or 64bit. + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid loosing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - mddev->resync_mark_cnt; - rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; - seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + rt = max_sectors - resync; /* number of remaining sectors */ + sector_div(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); } From 110518bccf076726cc93bf604527d8019aae50ba Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 7 May 2009 12:49:37 +1000 Subject: [PATCH 5/7] md: constify VFTs Signed-off-by: Jan Engelhardt Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5eb01a4d27ba..8350bde60d1b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5976,7 +5976,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations md_seq_ops = { +static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, From 5bf295975416f8e97117bbbcfb0191c00bc3e2b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:50:57 +1000 Subject: [PATCH 6/7] md: remove ability to explicit set an inactive array to 'clean'. Being able to write 'clean' to an 'array_state' of an inactive array to activate it in 'clean' mode is both unnecessary and inconvenient. It is unnecessary because the same can be achieved by writing 'active'. This activates and array, but it still remains 'clean' until the first write. It is inconvenient because writing 'clean' is more often used to cause an 'active' array to revert to 'clean' mode (thus blocking any writes until a 'write-pending' is promoted to 'active'). Allowing 'clean' to both activate an array and mark an active array as clean can lead to races: One program writes 'clean' to mark the active array as clean at the same time as another program writes 'inactive' to deactivate (stop) and active array. Depending on which writes first, the array could be deactivated and immediately reactivated which isn't what was desired. So just disable the use of 'clean' to activate an array. This avoids a race that can be triggered with mdadm-3.0 and external metadata, so it suitable for -stable. Reported-by: Rafal Marszewski Acked-by: Dan Williams Cc: Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8350bde60d1b..1dd723d31882 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3066,11 +3066,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) } else err = -EBUSY; spin_unlock_irq(&mddev->write_lock); - } else { - mddev->ro = 0; - mddev->recovery_cp = MaxSector; - err = do_md_run(mddev); - } + } else + err = -EINVAL; break; case active: if (mddev->pers) { From c4647292fda0833bebe45be27f04453b736981fa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 May 2009 12:51:06 +1000 Subject: [PATCH 7/7] md: remove rd%d links immediately after stopping an array. md maintains link in sys/mdXX/md/ to identify which device has which role in the array. e.g. rd2 -> dev-sda indicates that the device with role '2' in the array is sda. These links are only present when the array is active. They are created immediately after ->run is called, and so should be removed immediately after ->stop is called. However they are currently removed a little bit later, and it is possible for ->run to be called again, thus adding these links, before they are removed. So move the removal earlier so they are consistently only present when the array is active. Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1dd723d31882..fccc8343a250 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4294,6 +4294,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { int err = 0; struct gendisk *disk = mddev->gendisk; + mdk_rdev_t *rdev; if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); @@ -4336,6 +4337,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + set_capacity(disk, 0); mddev->changed = 1; @@ -4356,7 +4364,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) * Free resources if final stop */ if (mode == 0) { - mdk_rdev_t *rdev; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); @@ -4368,13 +4375,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } mddev->bitmap_offset = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work();