diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d9521aa69461..42439a4c1c51 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2077,36 +2077,101 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, } +/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks + * to process + */ +static int __handle_issuing_new_read_requests5(struct stripe_head *sh, + struct stripe_head_state *s, int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *failed_dev = &sh->dev[s->failed_num]; + + /* don't schedule compute operations or reads on the parity block while + * a check is in flight + */ + if ((disk_idx == sh->pd_idx) && + test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + return ~0; + + /* is the data in this block needed, and can we get it? */ + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || (s->failed && + (failed_dev->toread || (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags) + ))))) { + /* 1/ We would like to get this block, possibly by computing it, + * but we might not be able to. + * + * 2/ Since parity check operations potentially make the parity + * block !uptodate it will need to be refreshed before any + * compute operations on data disks are scheduled. + * + * 3/ We hold off parity block re-reads until check operations + * have quiesced. + */ + if ((s->uptodate == disks - 1) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + s->req_compute = 1; + sh->ops.count++; + /* Careful: from this point on 'uptodate' is in the eye + * of raid5_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. + */ + s->uptodate++; + return 0; /* uptodate + compute == disks */ + } else if ((s->uptodate < disks - 1) && + test_bit(R5_Insync, &dev->flags)) { + /* Note: we hold off compute operations while checks are + * in flight, but we still prefer 'compute' over 'read' + * hence we only read if (uptodate < * disks-1) + */ + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", disk_idx, + s->syncing); + } + } + + return ~0; +} + static void handle_issuing_new_read_requests5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed && (sh->dev[s->failed_num].toread || - (sh->dev[s->failed_num].towrite && - !test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags)) - )))) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to - */ - if (s->uptodate == disks-1) { - pr_debug("Computing block %d\n", i); - compute_block(sh, i); - s->uptodate++; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - i, s->syncing); - } - } + + /* Clear completed compute operations. Parity recovery + * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled + * later on in this routine + */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i = disks; i--; ) + if (__handle_issuing_new_read_requests5( + sh, s, i, disks) == 0) + break; } set_bit(STRIPE_HANDLE, &sh->state); } @@ -2223,7 +2288,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags)) { + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; else @@ -2232,9 +2298,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) - rcw++; + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; else rcw += 2*disks; } @@ -2248,7 +2314,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -2270,7 +2337,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -2288,8 +2356,17 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, /* now if nothing is locked, and if we have enough data, * we can start a write request */ - if (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) + /* since handle_stripe can be called at any time we need to handle the + * case where a compute block operation has been submitted and then a + * subsequent call wants to start a write request. raid5_run_ops only + * handles the case where compute block and postxor are requested + * simultaneously. If this is not the case then new writes need to be + * held off until the compute completes. + */ + if ((s->req_compute || + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) s->locked += handle_write_operations5(sh, rcw == 0, 0); } @@ -2650,6 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh) /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; if (dev->toread) s.to_read++; @@ -2706,7 +2784,8 @@ static void handle_stripe5(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate < disks)) || s.expanding) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_issuing_new_read_requests5(sh, &s, disks); /* Now we check to see if any write operations have recently diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 6fb9d94e6f2e..2293015de1d5 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -200,7 +200,7 @@ struct stripe_head { struct stripe_head_state { int syncing, expanding, expanded; int locked, uptodate, to_read, to_write, failed, written; - int non_overwrite; + int compute, req_compute, non_overwrite; int failed_num; };