diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9978b9f07fe9..ec6d675bf766 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -146,6 +146,7 @@ struct mirror_set { region_t nr_regions; int in_sync; int log_failure; + atomic_t suspend; atomic_t default_mirror; /* Default mirror */ @@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success) struct region_hash *rh = reg->rh; rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. + */ dispatch_bios(rh->ms, ®->delayed_bios); if (atomic_dec_and_test(&rh->recovery_in_flight)) wake_up_all(&_kmirrord_recovery_stopped); @@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /* * Dispatch io. */ - if (unlikely(ms->log_failure)) + if (unlikely(ms->log_failure)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, &sync); + spin_unlock_irq(&ms->lock); + } else while ((bio = bio_list_pop(&sync))) - bio_endio(bio, -EIO); - else while ((bio = bio_list_pop(&sync))) - do_write(ms, bio); + do_write(ms, bio); while ((bio = bio_list_pop(&recover))) rh_delay(&ms->rh, bio); @@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) if (!failures->head) return; - while ((bio = bio_list_pop(failures))) - __bio_mark_nosync(ms, bio, bio->bi_size, 0); + if (!ms->log_failure) { + while ((bio = bio_list_pop(failures))) + __bio_mark_nosync(ms, bio, bio->bi_size, 0); + return; + } + + /* + * If the log has failed, unattempted writes are being + * put on the failures list. We can't issue those writes + * until a log has been marked, so we must store them. + * + * If a 'noflush' suspend is in progress, we can requeue + * the I/O's to the core. This give userspace a chance + * to reconfigure the mirror, at which point the core + * will reissue the writes. If the 'noflush' flag is + * not set, we have no choice but to return errors. + * + * Some writes on the failures list may have been + * submitted before the log failure and represent a + * failure to write to one of the devices. It is ok + * for us to treat them the same and requeue them + * as well. + */ + if (dm_noflush_suspending(ms->ti)) { + while ((bio = bio_list_pop(failures))) + bio_endio(bio, DM_ENDIO_REQUEUE); + return; + } + + if (atomic_read(&ms->suspend)) { + while ((bio = bio_list_pop(failures))) + bio_endio(bio, -EIO); + return; + } + + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, failures); + spin_unlock_irq(&ms->lock); + + wake(ms); } static void trigger_event(struct work_struct *work) @@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->nr_mirrors = nr_mirrors; ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; + ms->log_failure = 0; + atomic_set(&ms->suspend, 0); atomic_set(&ms->default_mirror, DEFAULT_MIRROR); ms->io_client = dm_io_client_create(DM_IO_PAGES); @@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, return 0; } -static void mirror_postsuspend(struct dm_target *ti) +static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + atomic_set(&ms->suspend, 1); + + /* + * We must finish up all the work that we've + * generated (i.e. recovery work). + */ rh_stop_recovery(&ms->rh); - /* Wait for all I/O we generated to complete */ wait_event(_kmirrord_recovery_stopped, !atomic_read(&ms->rh.recovery_in_flight)); + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + + /* + * Now that recovery is complete/stopped and the + * delayed bios are queued, we need to wait for + * the worker thread to complete. This way, + * we know that all of our I/O has been pushed. + */ + flush_workqueue(ms->kmirrord_wq); +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dirty_log *log = ms->rh.log; + if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ - DMWARN("log suspend failed"); + DMWARN("log postsuspend failed"); } static void mirror_resume(struct dm_target *ti) { - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; struct dirty_log *log = ms->rh.log; + + atomic_set(&ms->suspend, 0); if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); @@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type, DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) DMEMIT(" %s %llu", ms->mirror[m].dev->name, - (unsigned long long)ms->mirror[m].offset); + (unsigned long long)ms->mirror[m].offset); if (ms->features & DM_RAID1_HANDLE_ERRORS) DMEMIT(" 1 handle_errors"); @@ -1581,6 +1659,7 @@ static struct target_type mirror_target = { .dtr = mirror_dtr, .map = mirror_map, .end_io = mirror_end_io, + .presuspend = mirror_presuspend, .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status,