diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ec6d675bf766..38efa7071dd7 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-list.h" +#include "dm-bio-record.h" #include "dm-io.h" #include "dm-log.h" #include "kcopyd.h" @@ -141,6 +142,7 @@ struct mirror_set { struct bio_list failures; struct dm_io_client *io_client; + mempool_t *read_record_pool; /* recovery */ region_t nr_regions; @@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh) wake(rh->ms); } +#define MIN_READ_RECORDS 20 +struct dm_raid1_read_record { + struct mirror *m; + struct dm_bio_details details; +}; + /* * Every mirror should look like this one. */ #define DEFAULT_MIRROR 0 /* - * This is yucky. We squirrel the mirror_set struct away inside - * bi_next for write buffers. This is safe since the bh + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh * doesn't get submitted to the lower levels of block layer. */ -static struct mirror_set *bio_get_ms(struct bio *bio) +static struct mirror *bio_get_m(struct bio *bio) { - return (struct mirror_set *) bio->bi_next; + return (struct mirror *) bio->bi_next; } -static void bio_set_ms(struct bio *bio, struct mirror_set *ms) +static void bio_set_m(struct bio *bio, struct mirror *m) { - bio->bi_next = (struct bio *) ms; + bio->bi_next = (struct bio *) m; } static struct mirror *get_default_mirror(struct mirror_set *ms) @@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms) *---------------------------------------------------------------*/ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) { - /* FIXME: add read balancing */ - return get_default_mirror(ms); + struct mirror *m = get_default_mirror(ms); + + do { + if (likely(!atomic_read(&m->error_count))) + return m; + + if (m-- == ms->mirror) + m += ms->nr_mirrors; + } while (m != get_default_mirror(ms)); + + return NULL; +} + +static int default_ok(struct mirror *m) +{ + struct mirror *default_mirror = get_default_mirror(m->ms); + + return !atomic_read(&default_mirror->error_count); +} + +static int mirror_available(struct mirror_set *ms, struct bio *bio) +{ + region_t region = bio_to_region(&ms->rh, bio); + + if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) + return choose_mirror(ms, bio->bi_sector) ? 1 : 0; + + return 0; } /* * remap a buffer to a particular mirror. */ -static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + return m->offset + (bio->bi_sector - m->ms->ti->begin); +} + +static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); + bio->bi_sector = map_sector(m, bio); +} + +static void map_region(struct io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio->bi_size >> 9; +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (likely(!error)) { + bio_endio(bio, 0); + return; + } + + fail_mirror(m, DM_RAID1_READ_ERROR); + + if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { + DMWARN_LIMIT("Read failure on mirror device %s. " + "Trying alternative device.", + m->dev->name); + queue_bio(m->ms, bio, bio_rw(bio)); + return; + } + + DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", + m->dev->name); + bio_endio(bio, -EIO); +} + +/* Asynchronous read. */ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct io_region io; + struct dm_io_request io_req = { + .bi_rw = READ, + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .notify.fn = read_callback, + .notify.context = bio, + .client = m->ms->io_client, + }; + + map_region(&io, m, bio); + bio_set_m(bio, m); + (void) dm_io(&io_req, 1, &io, NULL); } static void do_reads(struct mirror_set *ms, struct bio_list *reads) @@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) while ((bio = bio_list_pop(reads))) { region = bio_to_region(&ms->rh, bio); + m = get_default_mirror(ms); /* * We can only read balance if the region is in sync. */ - if (rh_in_sync(&ms->rh, region, 1)) + if (likely(rh_in_sync(&ms->rh, region, 1))) m = choose_mirror(ms, bio->bi_sector); - else - m = get_default_mirror(ms); + else if (m && atomic_read(&m->error_count)) + m = NULL; - map_bio(ms, m, bio); - generic_make_request(bio); + if (likely(m)) + read_async_bio(m, bio); + else + bio_endio(bio, -EIO); } } @@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context) int should_wake = 0; unsigned long flags; - ms = bio_get_ms(bio); - bio_set_ms(bio, NULL); + ms = bio_get_m(bio)->ms; + bio_set_m(bio, NULL); /* * NOTE: We don't decrement the pending count here, @@ -1008,7 +1107,7 @@ out: static void do_write(struct mirror_set *ms, struct bio *bio) { unsigned int i; - struct io_region io[KCOPYD_MAX_REGIONS+1]; + struct io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { .bi_rw = WRITE, @@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; - for (i = 0; i < ms->nr_mirrors; i++) { - m = ms->mirror + i; + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); - io[i].bdev = m->dev->bdev; - io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); - io[i].count = bio->bi_size >> 9; - } - - bio_set_ms(bio, ms); + /* + * Use default mirror because we only need it to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, get_default_mirror(ms)); (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); } @@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) rh_delay(&ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(ms, get_default_mirror(ms), bio); + map_bio(get_default_mirror(ms), bio); generic_make_request(bio); } } @@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, atomic_set(&ms->suspend, 0); atomic_set(&ms->default_mirror, DEFAULT_MIRROR); + len = sizeof(struct dm_raid1_read_record); + ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, + len); + if (!ms->read_record_pool) { + ti->error = "Error creating mirror read_record_pool"; + kfree(ms); + return NULL; + } + ms->io_client = dm_io_client_create(DM_IO_PAGES); if (IS_ERR(ms->io_client)) { ti->error = "Error creating dm_io client"; + mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); + mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, dm_io_client_destroy(ms->io_client); rh_exit(&ms->rh); + mempool_destroy(ms->read_record_pool); kfree(ms); } @@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, int r, rw = bio_rw(bio); struct mirror *m; struct mirror_set *ms = ti->private; - - map_context->ll = bio_to_region(&ms->rh, bio); + struct dm_raid1_read_record *read_record = NULL; if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + map_context->ll = bio_to_region(&ms->rh, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } @@ -1523,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (r < 0 && r != -EWOULDBLOCK) return r; - if (r == -EWOULDBLOCK) /* FIXME: ugly */ - r = DM_MAPIO_SUBMITTED; - /* - * We don't want to fast track a recovery just for a read - * ahead. So we just let it silently fail. - * FIXME: get rid of this. + * If region is not in-sync queue the bio. */ - if (!r && rw == READA) - return -EIO; + if (!r || (r == -EWOULDBLOCK)) { + if (rw == READA) + return -EWOULDBLOCK; - if (!r) { - /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } + /* + * The region is in-sync and we can perform reads directly. + * Store enough information so we can retry if it fails. + */ m = choose_mirror(ms, bio->bi_sector); - if (!m) + if (unlikely(!m)) return -EIO; - map_bio(ms, m, bio); + read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); + if (likely(read_record)) { + dm_bio_record(&read_record->details, bio); + map_context->ptr = read_record; + read_record->m = m; + } + + map_bio(m, bio); + return DM_MAPIO_REMAPPED; } @@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, { int rw = bio_rw(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; - region_t region = map_context->ll; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; + struct dm_raid1_read_record *read_record = map_context->ptr; /* * We need to dec pending if this was a write. */ - if (rw == WRITE) - rh_dec(&ms->rh, region); + if (rw == WRITE) { + rh_dec(&ms->rh, map_context->ll); + return error; + } - return 0; + if (error == -EOPNOTSUPP) + goto out; + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + goto out; + + if (unlikely(error)) { + if (!read_record) { + /* + * There wasn't enough memory to record necessary + * information for a retry or there was no other + * mirror in-sync. + */ + DMERR_LIMIT("Mirror read failed from %s.", + m->dev->name); + return -EIO; + } + DMERR("Mirror read failed from %s. Trying alternative device.", + m->dev->name); + + m = read_record->m; + fail_mirror(m, DM_RAID1_READ_ERROR); + + /* + * A failed read is requeued for another attempt using an intact + * mirror. + */ + if (default_ok(m) || mirror_available(ms, bio)) { + bd = &read_record->details; + + dm_bio_restore(bd, bio); + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + queue_bio(ms, bio, rw); + return 1; + } + DMERR("All replicated volumes dead, failing I/O"); + } + +out: + if (read_record) { + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + } + + return error; } static void mirror_presuspend(struct dm_target *ti)