From 9b622e2bbcf049c82e2550d35fb54ac205965f50 Mon Sep 17 00:00:00 2001 From: Tomasz Majchrzak Date: Thu, 28 Jul 2016 10:28:25 +0200 Subject: [PATCH 01/12] raid10: increment write counter after bio is split md pending write counter must be incremented after bio is split, otherwise it gets decremented too many times in end bio callback and becomes negative. Signed-off-by: Tomasz Majchrzak Reviewed-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ed29fc899f06..1a632a8c8005 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1064,6 +1064,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) int max_sectors; int sectors; + md_write_start(mddev, bio); + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -1445,8 +1447,6 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) return; } - md_write_start(mddev, bio); - do { /* From ff00d3b4e5e4395c825e8ec628f25932d812f31a Mon Sep 17 00:00:00 2001 From: ZhengYuan Liu Date: Thu, 28 Jul 2016 14:22:14 +0800 Subject: [PATCH 02/12] raid5: fix incorrectly counter of conf->empty_inactive_list_nr The counter conf->empty_inactive_list_nr is only used for determine if the raid5 is congested which is deal with in function raid5_congested(). It was increased in get_free_stripe() when conf->inactive_list got to be empty and decreased in release_inactive_stripe_list() when splice temp_inactive_list to conf->inactive_list. However, this may have a problem when raid5_get_active_stripe or stripe_add_to_batch_list was called, because these two functions may call list_del_init(&sh->lru) to delete sh from "conf->inactive_list + hash" which may cause "conf->inactive_list + hash" to be empty when atomic_inc_not_zero(&sh->count) got false. So a check should be done at these two point and increase empty_inactive_list_nr accordingly. Otherwise the counter may get to be negative number which would influence async readahead from VFS. Signed-off-by: ZhengYuan Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d189e894b921..e379b89fd8b1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -659,6 +659,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, { struct stripe_head *sh; int hash = stripe_hash_locks_hash(sector); + int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -703,7 +704,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, atomic_inc(&conf->active_stripes); BUG_ON(list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; @@ -762,6 +768,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh sector_t head_sector, tmp_sec; int hash; int dd_idx; + int inc_empty_inactive_list_flag; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; @@ -779,7 +786,12 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh atomic_inc(&conf->active_stripes); BUG_ON(list_empty(&head->lru) && !test_bit(STRIPE_EXPANDING, &head->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; list_del_init(&head->lru); + if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); if (head->group) { head->group->stripes_cnt--; head->group = NULL; From d9dd26b20cff88b45d861ec786d86b1c9bd2ee60 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sat, 30 Jul 2016 10:05:31 -0700 Subject: [PATCH 03/12] MD: hold mddev lock to change bitmap location Changing the location changes a lot of things. Holding the lock to avoid race. This makes the .quiesce called with mddev lock hold too. Acked-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 47 +++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6fff794e0c72..13041ee37ad6 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2183,19 +2183,29 @@ location_show(struct mddev *mddev, char *page) static ssize_t location_store(struct mddev *mddev, const char *buf, size_t len) { + int rv; + rv = mddev_lock(mddev); + if (rv) + return rv; if (mddev->pers) { - if (!mddev->pers->quiesce) - return -EBUSY; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; + if (!mddev->pers->quiesce) { + rv = -EBUSY; + goto out; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto out; + } } if (mddev->bitmap || mddev->bitmap_info.file || mddev->bitmap_info.offset) { /* bitmap already configured. Only option is to clear it */ - if (strncmp(buf, "none", 4) != 0) - return -EBUSY; + if (strncmp(buf, "none", 4) != 0) { + rv = -EBUSY; + goto out; + } if (mddev->pers) { mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); @@ -2214,21 +2224,25 @@ location_store(struct mddev *mddev, const char *buf, size_t len) /* nothing to be done */; else if (strncmp(buf, "file:", 5) == 0) { /* Not supported yet */ - return -EINVAL; + rv = -EINVAL; + goto out; } else { - int rv; if (buf[0] == '+') rv = kstrtoll(buf+1, 10, &offset); else rv = kstrtoll(buf, 10, &offset); if (rv) - return rv; - if (offset == 0) - return -EINVAL; + goto out; + if (offset == 0) { + rv = -EINVAL; + goto out; + } if (mddev->bitmap_info.external == 0 && mddev->major_version == 0 && - offset != mddev->bitmap_info.default_offset) - return -EINVAL; + offset != mddev->bitmap_info.default_offset) { + rv = -EINVAL; + goto out; + } mddev->bitmap_info.offset = offset; if (mddev->pers) { struct bitmap *bitmap; @@ -2245,7 +2259,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 0); if (rv) { bitmap_destroy(mddev); - return rv; + goto out; } } } @@ -2257,6 +2271,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len) set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); } + rv = 0; +out: + mddev_unlock(mddev); + if (rv) + return rv; return len; } From 11367799f3d12a5074c4a3c0fa4ea8da2a21a2a4 Mon Sep 17 00:00:00 2001 From: Alexey Obitotskiy Date: Wed, 3 Aug 2016 10:02:56 +0200 Subject: [PATCH 04/12] md: Prevent IO hold during accessing to faulty raid5 array After array enters in faulty state (e.g. number of failed drives becomes more then accepted for raid5 level) it sets error flags (one of this flags is MD_CHANGE_PENDING). For internal metadata arrays MD_CHANGE_PENDING cleared into md_update_sb, but not for external metadata arrays. MD_CHANGE_PENDING flag set prevents to finish all new or non-finished IOs to array and hold them in pending state. In some cases this can leads to deadlock situation. For example, we have faulty array (2 of 4 drives failed) and udev handle array state changes and blkid started (or other userspace application that used array to read/write) but unable to finish reads due to IO hold. At the same time we unable to get exclusive access to array (to stop array in our case) because another external application still use this array. Fix makes possible to return IO with errors immediately. So external application can finish working with array and give exclusive access to other applications to perform required management actions with array. Signed-off-by: Alexey Obitotskiy Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e379b89fd8b1..4f8f5242ea3b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4640,7 +4640,9 @@ finish: } if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && + (s.failed <= conf->max_degraded || + conf->mddev->external == 0)) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->return_bi, &s.return_bi); spin_unlock_irq(&conf->device_lock); From b347af816ad2086c1dacf9f74973b82f83e877be Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 11 Aug 2016 17:14:45 -0700 Subject: [PATCH 05/12] md: do not count journal as spare in GET_ARRAY_INFO GET_ARRAY_INFO counts journal as spare (spare_disks), which is not accurate. This patch fixes this. Reported-by: Yi Zhang Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2c3ab6f5e6be..d750b52376b9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5851,6 +5851,9 @@ static int get_array_info(struct mddev *mddev, void __user *arg) working++; if (test_bit(In_sync, &rdev->flags)) insync++; + else if (test_bit(Journal, &rdev->flags)) + /* TODO: add journal count to md_u.h */ + ; else spare++; } From 207efcd2b55e0460dfee35663fbb3d05efad990a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:40 +0800 Subject: [PATCH 06/12] md: remove obsolete ret in md_start_sync The ret is not needed anymore since we have already move resync_start into md_do_sync in commit 41a9a0d. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d750b52376b9..19d8e23bda5c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8278,16 +8278,13 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - int ret = 0; mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); if (!mddev->sync_thread) { - if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + printk(KERN_ERR "%s: could not start resync thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); From c622ca543bff8e73efacf4dafa0cc9851ecea511 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 16 Aug 2016 14:26:08 +0200 Subject: [PATCH 07/12] md: don't print the same repeated messages about delayed sync operation This fixes a long-standing bug that caused a flood of messages like: "md: delaying data-check of md1 until md2 has finished (they share one or more physical units)" It can be reproduced like this: 1. Create at least 3 raid1 arrays on a pair of disks, each on different partitions. 2. Request a sync operation like 'check' or 'repair' on 2 arrays by writing to their md/sync_action attribute files. One operation should start and one should be delayed and a message like the above will be printed. 3. Issue a write to the third array. Each write will cause 2 copies of the message to be printed. This happens when wake_up(&resync_wait) is called, usually by md_check_recovery(). Then the delayed sync thread again prints the message and is put to sleep. This patch adds a check in md_do_sync() to prevent printing this message more than once for the same pair of devices. Reported-by: Sven Koehler Link: https://bugzilla.kernel.org/show_bug.cgi?id=151801 Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 19d8e23bda5c..cc25cbcf10b2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7865,6 +7865,7 @@ void md_do_sync(struct md_thread *thread) */ do { + int mddev2_minor = -1; mddev->curr_resync = 2; try_again: @@ -7894,10 +7895,14 @@ void md_do_sync(struct md_thread *thread) prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying %s of %s" - " until %s has finished (they" - " share one or more physical units)\n", - desc, mdname(mddev), mdname(mddev2)); + if (mddev2_minor != mddev2->md_minor) { + mddev2_minor = mddev2->md_minor; + printk(KERN_INFO "md: delaying %s of %s" + " until %s has finished (they" + " share one or more physical units)\n", + desc, mdname(mddev), + mdname(mddev2)); + } mddev_put(mddev2); if (signal_pending(current)) flush_signals(current); From 486b0f7bcd64be027535811ef44195bc1027fbd3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 19 Aug 2016 15:34:01 -0700 Subject: [PATCH 08/12] r5cache: set MD_JOURNAL_CLEAN correctly Currently, the code sets MD_JOURNAL_CLEAN when the array has MD_FEATURE_JOURNAL and the recovery_cp is MaxSector. The array will be MD_JOURNAL_CLEAN even if the journal device is missing. With this patch, the MD_JOURNAL_CLEAN is only set when the journal device presents. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/md.c | 5 +---- drivers/md/raid5.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index cc25cbcf10b2..4f6cf3b849e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1604,11 +1604,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) { + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); - if (mddev->recovery_cp == MaxSector) - set_bit(MD_JOURNAL_CLEAN, &mddev->flags); - } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4f8f5242ea3b..2119e094dfb3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6840,11 +6840,14 @@ static int raid5_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) { - printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n", - mdname(mddev)); - mddev->ro = 1; - set_disk_ro(mddev->gendisk, 1); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + if (!journal_dev) { + pr_err("md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + } else if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); } conf->min_offset_diff = min_offset_diff; From 0f6187dbe542d71ace8ba0908954b0f4f8a30a1e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 21 Aug 2016 14:42:25 +0000 Subject: [PATCH 09/12] md-cluster: fix error return code in join() Fix to return error code -ENOMEM from the lockres_init() error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 41573f1f626f..34a840d9df76 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -834,8 +834,10 @@ static int join(struct mddev *mddev, int nodes) goto err; } cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); - if (!cinfo->ack_lockres) + if (!cinfo->ack_lockres) { + ret = -ENOMEM; goto err; + } /* get sync CR lock on ACK. */ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", @@ -849,8 +851,10 @@ static int join(struct mddev *mddev, int nodes) pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); - if (!cinfo->bitmap_lockres) + if (!cinfo->bitmap_lockres) { + ret = -ENOMEM; goto err; + } if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { pr_err("Failed to get bitmap lock\n"); ret = -EINVAL; @@ -858,8 +862,10 @@ static int join(struct mddev *mddev, int nodes) } cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); - if (!cinfo->resync_lockres) + if (!cinfo->resync_lockres) { + ret = -ENOMEM; goto err; + } return 0; err: From 27028626b4b9022dcac23688e09ea43b36e1183c Mon Sep 17 00:00:00 2001 From: Tomasz Majchrzak Date: Tue, 23 Aug 2016 10:53:57 +0200 Subject: [PATCH 10/12] raid10: record correct address of bad block For failed write request record block address on a device, not block address in an array. Signed-off-by: Tomasz Majchrzak Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1a632a8c8005..4589866257d5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2465,20 +2465,21 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) while (sect_to_write) { struct bio *wbio; + sector_t wsector; if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); - wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, rdev) + - (sector - r10_bio->sector)); + wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); + wbio->bi_iter.bi_sector = wsector + + choose_data_offset(r10_bio, rdev); wbio->bi_bdev = rdev->bdev; bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (submit_bio_wait(wbio) < 0) /* Failure! */ - ok = rdev_set_badblocks(rdev, sector, + ok = rdev_set_badblocks(rdev, wsector, sectors, 0) && ok; From 5f9d1fde7d54a5d5fd8cccbee9c9c31474fcdcf2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 22 Aug 2016 21:14:01 -0700 Subject: [PATCH 11/12] raid5: fix memory leak of bio integrity data Yi reported a memory leak of raid5 with DIF/DIX enabled disks. raid5 doesn't alloc/free bio, instead it reuses bios. There are two issues in current code: 1. the code calls bio_init (from init_stripe->raid5_build_block->bio_init) then bio_reset (ops_run_io). The bio is reused, so likely there is integrity data attached. bio_init will clear a pointer to integrity data and makes bio_reset can't release the data 2. bio_reset is called before dispatching bio. After bio is finished, it's possible we don't free bio's integrity data (eg, we don't call bio_reset again) Both issues will cause memory leak. The patch moves bio_init to stripe creation and bio_reset to bio end io. This will fix the two issues. Reported-by: Yi Zhang Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2119e094dfb3..d1a279b1916b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1005,7 +1005,6 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bio_reset(bi); bi->bi_bdev = rdev->bdev; bio_set_op_attrs(bi, op, op_flags); bi->bi_end_io = op_is_write(op) @@ -1057,7 +1056,6 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bio_reset(rbi); rbi->bi_bdev = rrdev->bdev; bio_set_op_attrs(rbi, op, op_flags); BUG_ON(!op_is_write(op)); @@ -1990,9 +1988,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, + int disks) { struct stripe_head *sh; + int i; sh = kmem_cache_zalloc(sc, gfp); if (sh) { @@ -2001,6 +2001,12 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) INIT_LIST_HEAD(&sh->batch_list); INIT_LIST_HEAD(&sh->lru); atomic_set(&sh->count, 1); + for (i = 0; i < disks; i++) { + struct r5dev *dev = &sh->dev[i]; + + bio_init(&dev->req); + bio_init(&dev->rreq); + } } return sh; } @@ -2008,7 +2014,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = alloc_stripe(conf->slab_cache, gfp); + sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); if (!sh) return 0; @@ -2179,7 +2185,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) mutex_lock(&conf->cache_size_mutex); for (i = conf->max_nr_stripes; i; i--) { - nsh = alloc_stripe(sc, GFP_KERNEL); + nsh = alloc_stripe(sc, GFP_KERNEL, newsize); if (!nsh) break; @@ -2311,6 +2317,7 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_error); if (i == disks) { + bio_reset(bi); BUG(); return; } @@ -2414,6 +2421,7 @@ static void raid5_end_read_request(struct bio * bi) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); + bio_reset(bi); } static void raid5_end_write_request(struct bio *bi) @@ -2448,6 +2456,7 @@ static void raid5_end_write_request(struct bio *bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_error); if (i == disks) { + bio_reset(bi); BUG(); return; } @@ -2491,18 +2500,17 @@ static void raid5_end_write_request(struct bio *bi) if (sh->batch_head && sh != sh->batch_head) raid5_release_stripe(sh->batch_head); + bio_reset(bi); } static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; - bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; dev->req.bi_max_vecs = 1; dev->req.bi_private = sh; - bio_init(&dev->rreq); dev->rreq.bi_io_vec = &dev->rvec; dev->rreq.bi_max_vecs = 1; dev->rreq.bi_private = sh; From 45c91d808ff989d950e260dab9f89e8f4a3c9c2c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 22 Aug 2016 21:14:02 -0700 Subject: [PATCH 12/12] raid5: avoid unnecessary bio data set bio_reset doesn't change bi_io_vec and bi_max_vecs, so we don't need to set them every time. bi_private will be set before the bio is dispatched. Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d1a279b1916b..62febe8d4919 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2005,7 +2005,12 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, struct r5dev *dev = &sh->dev[i]; bio_init(&dev->req); + dev->req.bi_io_vec = &dev->vec; + dev->req.bi_max_vecs = 1; + bio_init(&dev->rreq); + dev->rreq.bi_io_vec = &dev->rvec; + dev->rreq.bi_max_vecs = 1; } } return sh; @@ -2507,14 +2512,6 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_max_vecs = 1; - dev->req.bi_private = sh; - - dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_max_vecs = 1; - dev->rreq.bi_private = sh; - dev->flags = 0; dev->sector = raid5_compute_blocknr(sh, i, previous); }