From 0e13fe23a00ad88c737d91d94a050707c6139ce4 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:20 +1000 Subject: use bio_endio instead of a call to bi_end_io Turn calls to bi->bi_end_io() into bio_endio(). Apparently bio_endio does exactly the same error processing as is hardcoded at these places. bio_endio() avoids recursion (or will soon), so it should be used. Signed-off-by: Mikulas Patocka Signed-off-by: Neil Brown --- drivers/md/raid5.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee2..214b441 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -115,9 +115,7 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } @@ -3700,9 +3698,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); } return 0; } @@ -4005,12 +4001,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) spin_lock_irq(&conf->device_lock); remaining = --raid_bio->bi_phys_segments; spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - - raid_bio->bi_end_io(raid_bio, - test_bit(BIO_UPTODATE, &raid_bio->bi_flags) - ? 0 : -EIO); - } + if (remaining == 0) + bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; -- cgit v1.1 From 6c2fce2ef6b4821c21b5c42c7207cb9cf8c87eda Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:31 +1000 Subject: Support adding a spare to a live md array with external metadata. i.e. extend the 'md/dev-XXX/slot' attribute so that you can tell a device to fill an vacant slot in an and md array. Signed-off-by: Neil Brown --- drivers/md/raid5.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 214b441..002f33b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4607,21 +4607,27 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int found = 0; int disk; struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; if (mddev->degraded > conf->max_degraded) /* no point adding a device */ return 0; + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + /* * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) disk = rdev->saved_raid_disk; else - disk = 0; - for ( ; disk < conf->raid_disks; disk++) + disk = first; + for ( ; disk <= last ; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; -- cgit v1.1 From 199050ea1ff2270174ee525b73bc4c3323098897 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 28 Jun 2008 08:31:33 +1000 Subject: rationalise return value for ->hot_add_disk method. For all array types but linear, ->hot_add_disk returns 1 on success, 0 on failure. For linear, it returns 0 on success and -errno on failure. This doesn't cause a functional problem because the ->hot_add_disk function of linear is used quite differently to the others. However it is confusing. So convert all to return 0 for success or -errno on failure and fix call sites to match. Signed-off-by: Neil Brown --- drivers/md/raid5.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 002f33b..8c4e614 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4604,7 +4604,7 @@ abort: static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { raid5_conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int disk; struct disk_info *p; int first = 0; @@ -4612,7 +4612,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->degraded > conf->max_degraded) /* no point adding a device */ - return 0; + return -EINVAL; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -4631,14 +4631,14 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - found = 1; + err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); - return found; + return err; } static int raid5_resize(mddev_t *mddev, sector_t sectors) @@ -4739,7 +4739,7 @@ static int raid5_start_reshape(mddev_t *mddev) rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev)) { + if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; set_bit(In_sync, &rdev->flags); added_devices++; -- cgit v1.1 From b203886edbcaac3ca427cf4dbcb50b18bdb346fd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:50 +1000 Subject: md: kill STRIPE_OP_MOD_DMA in raid5 offload From: Dan Williams This micro-optimization allowed the raid code to skip a re-read of the parity block after checking parity. It took advantage of the fact that xor-offload-engines have their own internal result buffer and can check parity without writing to memory. Remove it for the following reasons: 1/ It is a layering violation for MD to need to manage the DMA and non-DMA paths within async_xor_zero_sum 2/ Bad precedent to toggle the 'ops' flags outside the lock 3/ Hard to realize a performance gain as reads will not need an updated parity block and writes will dirty it anyways. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8c4e614..60e61d2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -837,15 +837,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -873,11 +868,6 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); -- cgit v1.1 From 2b7497f0e0a0b9cf21d822e427d5399b2056501a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:52 +1000 Subject: md: kill STRIPE_OP_IO flag From: Dan Williams The R5_Want{Read,Write} flags already gate i/o. So, this flag is superfluous and we can unconditionally call ops_run_io(). Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 60e61d2..cac9708 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -373,8 +373,6 @@ static unsigned long get_stripe_work(struct stripe_head *sh) test_and_ack_op(STRIPE_OP_BIODRAIN, pending); test_and_ack_op(STRIPE_OP_POSTXOR, pending); test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; sh->ops.count -= ack; if (unlikely(sh->ops.count < 0)) { @@ -399,7 +397,6 @@ static void ops_run_io(struct stripe_head *sh) might_sleep(); - set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@ -433,6 +430,8 @@ static void ops_run_io(struct stripe_head *sh) test_bit(STRIPE_EXPAND_READY, &sh->state)) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, @@ -900,9 +899,6 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) if (test_bit(STRIPE_OP_CHECK, &pending)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2013,8 +2009,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); @@ -2208,9 +2202,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2234,9 +2225,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2444,8 +2432,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); s->locked++; @@ -2801,9 +2787,6 @@ static void handle_stripe5(struct stripe_head *sh) (i == sh->pd_idx || dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; if (prexor) continue; if (!test_bit(R5_Insync, &dev->flags) || @@ -2857,16 +2840,12 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); s.locked++; } @@ -2884,13 +2863,10 @@ static void handle_stripe5(struct stripe_head *sh) clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - for (i = conf->raid_disks; i--; ) { + for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; - } } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && @@ -2926,6 +2902,8 @@ static void handle_stripe5(struct stripe_head *sh) if (pending) raid5_run_ops(sh, pending); + ops_run_io(sh); + return_io(return_bi); } -- cgit v1.1 From c4e5ac0a22e664eecf29249553cf16c2433f5f25 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:53 +1000 Subject: md: use stripe_head_state in ops_run_io() From: Dan Williams In handle_stripe after taking sh->lock we sample some bits into 's' (struct stripe_head_state): s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); Use these values from 's' in ops_run_io() rather than re-sampling the bits. This ensures a consistent snapshot (as seen under sh->lock) is used. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cac9708..c4ef307 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -390,7 +390,7 @@ raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); -static void ops_run_io(struct stripe_head *sh) +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; @@ -425,9 +425,7 @@ static void ops_run_io(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -2902,10 +2900,9 @@ static void handle_stripe5(struct stripe_head *sh) if (pending) raid5_run_ops(sh, pending); - ops_run_io(sh); + ops_run_io(sh, &s); return_io(return_bi); - } static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) -- cgit v1.1 From f0e43bcdebf709d747a3effb210aff1941e819ab Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:55 +1000 Subject: md: unify raid5/6 i/o submission From: Dan Williams Let the raid6 path call ops_run_io to get pending i/o submitted. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 63 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 61 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c4ef307..6f3dd12 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3114,68 +3114,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - return_io(return_bi); - - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - set_bit(STRIPE_IO_STARTED, &sh->state); - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (s.syncing || s.expanding || s.expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + ops_run_io(sh, &s); - bi->bi_bdev = rdev->bdev; - pr_debug("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - pr_debug("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } + return_io(return_bi); } static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) -- cgit v1.1 From ecc65c9b3f9b9d740a5deade3d85b39be56401b6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:57 +1000 Subject: md: replace STRIPE_OP_CHECK with 'check_states' From: Dan Williams The STRIPE_OP_* flags record the state of stripe operations which are performed outside the stripe lock. Their use in indicating which operations need to be run is straightforward; however, interpolating what the next state of the stripe should be based on a given combination of these flags is not straightforward, and has led to bugs. An easier to read implementation with minimal degrees of freedom is needed. Towards this goal, this patch introduces explicit states to replace what was previously interpolated from the STRIPE_OP_* flags. For now this only converts the handle_parity_checks5 path, removing a user of the ops.{pending,ack,complete,count} fields of struct stripe_operations. This conversion also found a remaining issue with the current code. There is a small window for a drive to fail between when we schedule a repair and when the parity calculation for that repair completes. When this happens we will writeback to 'failed_num' when we really want to write back to 'pd_idx'. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 172 ++++++++++++++++++++++++++--------------------------- 1 file changed, 83 insertions(+), 89 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6f3dd12..544e160 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref) set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; + else + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh) ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, + unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) + if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) || + test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) tx = ops_run_compute5(sh, pending); if (test_bit(STRIPE_OP_PREXOR, &pending)) @@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) if (test_bit(STRIPE_OP_POSTXOR, &pending)) ops_run_postxor(sh, tx, pending); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); if (overlap_clear) @@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, /* don't schedule compute operations or reads on the parity block while * a check is in flight */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + if (disk_idx == sh->pd_idx && sh->check_state) return ~0; /* is the data in this block needed, and can we get it? */ @@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * 3/ We hold off parity block re-reads until check operations * have quiesced. */ - if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + if ((s->uptodate == disks - 1) && !sh->check_state && + (s->failed && disk_idx == s->failed_num)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; @@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, { int i; - /* Clear completed compute operations. Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + /* Clear completed compute operations */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) { clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); @@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back. If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh) * block. */ if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + !sh->check_state) handle_issuing_new_write_requests5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe @@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh) * data is available. The parity check is held off while parity * dependent operations are in flight. */ - if ((s.syncing && s.locked == 0 && + if (sh->check_state || + (s.syncing && s.locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_INSYNC, &sh->state)) || - test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending) - raid5_run_ops(sh, pending); + if (pending || s.ops_request) + raid5_run_ops(sh, pending, s.ops_request); ops_run_io(sh, &s); -- cgit v1.1 From 83de75cc92be599850e5ef3928e07cd840833499 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:31:58 +1000 Subject: md: replace STRIPE_OP_BIOFILL with STRIPE_BIOFILL_RUN From: Dan Williams Track the state of read operations (copying data from the stripe cache to bio buffers outside the lock) with a state flag. Reduce the scope of the STRIPE_OP_BIOFILL flag to only tracking whether a biofill operation has been requested via the ops_request field of struct stripe_head_state. This is another step towards the removal of ops.{pending,ack,complete,count}, i.e. STRIPE_OP_BIOFILL only requests an operation and does not track the state of the operation. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 544e160..b9c0a32 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -523,38 +523,34 @@ static void ops_complete_biofill(void *stripe_head_ref) (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); if (--rbi->bi_phys_segments == 0) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -880,7 +876,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } @@ -2630,15 +2626,8 @@ static void handle_stripe5(struct stripe_head *sh) s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2652,10 +2641,10 @@ static void handle_stripe5(struct stripe_head *sh) /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@ -2699,8 +2688,10 @@ static void handle_stripe5(struct stripe_head *sh) goto unlock; } - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", -- cgit v1.1 From 976ea8d475675da6e86bd434328814ccbf5ae641 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:03 +1000 Subject: md: replace STRIPE_OP_COMPUTE_BLK with STRIPE_COMPUTE_RUN From: Dan Williams Track the state of compute operations (recalculating a block from all the other blocks in a stripe) with a state flag. Reduces the scope of the STRIPE_OP_COMPUTE_BLK flag to only tracking whether a compute operation has been requested via the ops_request field of struct stripe_head_state. Note, the compute operation that is performed in the course of doing a 'repair' operation (check the parity block, recalculate it and write it back if the check result is not zero) is tracked separately with the 'check_state' variable. Compute operations are held off while a 'check' is in progress, and moving this check out to handle_issuing_new_read_requests5 the helper routine __handle_issuing_new_read_requests5 can be simplified. This is another step towards the removal of ops.{pending,ack,complete,count}, i.e. STRIPE_OP_COMPUTE_BLK only requests an operation and does not track the state of the operation. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 65 +++++++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 47 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b9c0a32..835046b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -604,8 +604,6 @@ static void ops_complete_compute5(void *stripe_head_ref) clear_bit(STRIPE_COMPUTE_RUN, &sh->state); if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; - else - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -881,8 +879,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) || - test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) tx = ops_run_compute5(sh, pending); if (test_bit(STRIPE_OP_PREXOR, &pending)) @@ -1960,12 +1957,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if (disk_idx == sh->pd_idx && sh->check_state) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || @@ -1974,23 +1965,16 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, (failed_dev->toread || (failed_dev->towrite && !test_bit(R5_OVERWRITE, &failed_dev->flags) ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ - if ((s->uptodate == disks - 1) && !sh->check_state && + if ((s->uptodate == disks - 1) && (s->failed && disk_idx == s->failed_num)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@ -1999,12 +1983,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ s->uptodate++; return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -2021,20 +2000,13 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, { int i; - /* Clear completed compute operations */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { for (i = disks; i--; ) if (__handle_issuing_new_read_requests5( sh, s, i, disks) == 0) @@ -2236,10 +2208,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) s->locked += handle_write_operations5(sh, rcw == 0, 0); } @@ -2410,6 +2381,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(STRIPE_INSYNC, &sh->state); else { sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &sh->dev[sh->pd_idx].flags); @@ -2725,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || - test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) handle_issuing_new_read_requests5(sh, &s, disks); /* Now we check to see if any write operations have recently @@ -2803,7 +2774,7 @@ static void handle_stripe5(struct stripe_head *sh) */ if (sh->check_state || (s.syncing && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); @@ -2869,7 +2840,7 @@ static void handle_stripe5(struct stripe_head *sh) } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); if (sh->ops.count) @@ -3089,7 +3060,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, &r6s); unlock: -- cgit v1.1 From 600aa10993012ff2dd5617720dac081e4f992017 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:05 +1000 Subject: md: replace STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} with 'reconstruct_states' From: Dan Williams Track the state of reconstruct operations (recalculating the parity block usually due to incoming writes, or as part of array expansion) Reduces the scope of the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags to only tracking whether a reconstruct operation has been requested via the ops_request field of struct stripe_head_state. This is the final step in the removal of ops.{pending,ack,complete,count}, i.e. the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags only request an operation and do not track the state of the operation. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 204 ++++++++++++++++------------------------------------- 1 file changed, 62 insertions(+), 142 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 835046b..b915936 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -122,6 +122,13 @@ static void return_io(struct bio *return_bi) static void print_raid5_conf (raid5_conf_t *conf); +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@ -141,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -243,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@ -344,47 +351,6 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -/* test_and_ack_op() ensures that we only dequeue an operation once */ -#define test_and_ack_op(op, pend) \ -do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ -} while (0) - -/* find new work to run, do not resubmit work that is already - * in flight - */ -static unsigned long get_stripe_work(struct stripe_head *sh) -{ - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; -} - static void raid5_end_read_request(struct bio *bi, int error); static void @@ -609,7 +575,7 @@ static void ops_complete_compute5(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long pending) +ops_run_compute5(struct stripe_head *sh, unsigned long ops_request) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -640,7 +606,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ops_complete_compute5, sh); /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) async_tx_ack(tx); return tx; @@ -652,8 +618,6 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@ -686,7 +650,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) + unsigned long ops_request) { int disks = sh->disks; int pd_idx = sh->pd_idx, i; @@ -694,7 +658,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (Wantprexor) */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -744,7 +708,7 @@ static void ops_complete_postxor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + sh->reconstruct_state = reconstruct_state_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -763,16 +727,14 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - + sh->reconstruct_state = reconstruct_state_drain_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) + unsigned long ops_request) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -780,7 +742,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); unsigned long flags; dma_async_tx_callback callback; @@ -807,7 +769,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? + callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ? ops_complete_write : ops_complete_postxor; /* 1/ if we prexor'd then the dest is reused as a source @@ -868,8 +830,7 @@ static void ops_run_check(struct stripe_head *sh) ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, - unsigned long ops_request) +static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -880,18 +841,18 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending, } if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) - tx = ops_run_compute5(sh, pending); + tx = ops_run_compute5(sh, ops_request); - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx, ops_request); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx, ops_request); if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); @@ -1684,11 +1645,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } -static int -handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +static void +handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1696,12 +1657,12 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1710,21 +1671,20 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) set_bit(R5_LOCKED, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } - if (locked + 1 == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1742,7 +1702,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) set_bit(R5_Wantprexor, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@ -1752,13 +1712,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - locked++; + s->locked++; - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, - locked, sh->ops.pending); - - return locked; + s->locked, s->ops_request); } /* @@ -2005,8 +1963,7 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { for (i = disks; i--; ) if (__handle_issuing_new_read_requests5( sh, s, i, disks) == 0) @@ -2211,7 +2168,7 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + handle_write_operations5(sh, s, rcw == 0, 0); } static void handle_issuing_new_write_requests6(raid5_conf_t *conf, @@ -2581,15 +2538,14 @@ static void handle_stripe5(struct stripe_head *sh) struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2703,34 +2659,12 @@ static void handle_stripe5(struct stripe_head *sh) /* Now we check to see if any write operations have recently * completed */ - - /* leave prexor set until postxor is done, allows us to distinguish - * a rmw from a rcw during biodrain - */ prexor = 0; - if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - prexor = 1; - clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - + if (sh->reconstruct_state == reconstruct_state_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; for (i = disks; i--; ) - clear_bit(R5_Wantprexor, &sh->dev[i].flags); - } - - /* if only POSTXOR is set then this is an 'expand' postxor */ - if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + prexor += test_and_clear_bit(R5_Wantprexor, + &sh->dev[i].flags); /* All the 'written' buffers and the parity block are ready to * be written back to disk @@ -2763,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh) * 2/ A 'check' operation is in flight, as it may clobber the parity * block. */ - if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !sh->check_state) + if (s.to_write && !sh->reconstruct_state && !sh->check_state) handle_issuing_new_write_requests5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe @@ -2805,18 +2738,10 @@ static void handle_stripe5(struct stripe_head *sh) } } - /* Finish postxor operations initiated by the expansion - * process - */ - if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && - !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { - + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_LOCKED, &dev->flags); @@ -2824,15 +2749,13 @@ static void handle_stripe5(struct stripe_head *sh) } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - s.locked += handle_write_operations5(sh, 1, 1); - } else if (s.expanded && - s.locked == 0 && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + handle_write_operations5(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2843,9 +2766,6 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); - if (sh->ops.count) - pending = get_stripe_work(sh); - unlock: spin_unlock(&sh->lock); @@ -2853,8 +2773,8 @@ static void handle_stripe5(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending || s.ops_request) - raid5_run_ops(sh, pending, s.ops_request); + if (s.ops_request) + raid5_run_ops(sh, s.ops_request); ops_run_io(sh, &s); -- cgit v1.1 From d8ee0728b5b30d7a6f62c399a95e953616d31f23 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:06 +1000 Subject: md: replace R5_WantPrexor with R5_WantDrain, add 'prexor' reconstruct_states From: Dan Williams Currently ops_run_biodrain and other locations have extra logic to determine which blocks are processed in the prexor and non-prexor cases. This can be eliminated if handle_write_operations5 flags the blocks to be processed in all cases via R5_Wantdrain. The presence of the prexor operation is tracked in sh->reconstruct_state. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 89 ++++++++++++++++++------------------------------------ 1 file changed, 29 insertions(+), 60 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b915936..c712460 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -637,7 +637,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -649,16 +649,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long ops_request) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -666,20 +660,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@ -704,18 +686,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - sh->reconstruct_state = reconstruct_state_result; - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_complete_write(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, @@ -727,14 +697,21 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - sh->reconstruct_state = reconstruct_state_drain_result; + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } + set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long ops_request) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -742,9 +719,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -752,7 +728,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -768,10 +745,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ? - ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@ -785,10 +758,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) @@ -847,12 +820,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) tx = ops_run_prexor(sh, tx); if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { - tx = ops_run_biodrain(sh, tx, ops_request); + tx = ops_run_biodrain(sh, tx); overlap_clear++; } if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) - ops_run_postxor(sh, tx, ops_request); + ops_run_postxor(sh, tx); if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); @@ -1669,6 +1642,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); s->locked++; @@ -1681,7 +1655,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - sh->reconstruct_state = reconstruct_state_drain_run; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; set_bit(STRIPE_OP_PREXOR, &s->ops_request); set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); set_bit(STRIPE_OP_POSTXOR, &s->ops_request); @@ -1691,15 +1665,10 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); s->locked++; @@ -2660,11 +2629,11 @@ static void handle_stripe5(struct stripe_head *sh) * completed */ prexor = 0; - if (sh->reconstruct_state == reconstruct_state_drain_result) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { sh->reconstruct_state = reconstruct_state_idle; - for (i = disks; i--; ) - prexor += test_and_clear_bit(R5_Wantprexor, - &sh->dev[i].flags); /* All the 'written' buffers and the parity block are ready to * be written back to disk -- cgit v1.1 From 7b3a871ed995270268a481404454ceafe1a87478 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 08:32:09 +1000 Subject: md: handle operation chaining in raid5_run_ops From: Dan Williams Neil said: > At the end of ops_run_compute5 you have: > /* ack now if postxor is not set to be run */ > if (tx && !test_bit(STRIPE_OP_POSTXOR, &s->ops_run)) > async_tx_ack(tx); > > It looks odd having that test there. Would it fit in raid5_run_ops > better? The intended global interpretation is that raid5_run_ops can build a chain of xor and memcpy operations. When MD registers the compute-xor it tells async_tx to keep the operation handle around so that another item in the dependency chain can be submitted. If we are just computing a block to satisfy a read then we can terminate the chain immediately. raid5_run_ops gives a better context for this test since it cares about the entire chain. Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c712460..456c3c2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -574,8 +574,7 @@ static void ops_complete_compute5(void *stripe_head_ref) release_stripe(sh); } -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long ops_request) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -605,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long ops_request) ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) - async_tx_ack(tx); - return tx; } @@ -813,8 +808,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) - tx = ops_run_compute5(sh, ops_request); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); -- cgit v1.1 From 1fe797e67fb07d605b82300934d0de67068a0aca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 28 Jun 2008 09:16:30 +1000 Subject: md: rationalize raid5 function names From: Dan Williams Commit a4456856 refactored some of the deep code paths in raid5.c into separate functions. The names chosen at the time do not consistently indicate what is going to happen to the stripe. So, update the names, and since a stripe is a cache element use cache semantics like fill, dirty, and clean. (also, fix up the indentation in fetch_block5) Signed-off-by: Dan Williams Signed-off-by: Neil Brown --- drivers/md/raid5.c | 76 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 456c3c2..4426220 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1618,7 +1618,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } static void -handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s, +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; @@ -1783,7 +1783,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) } static void -handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1874,23 +1874,28 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ -static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { /* We would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -1908,7 +1913,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ + return 1; /* uptodate + compute == disks */ } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); @@ -1918,10 +1923,13 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, } } - return ~0; + return 0; } -static void handle_issuing_new_read_requests5(struct stripe_head *sh, +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; @@ -1931,16 +1939,14 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) { + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -1999,12 +2005,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, } -/* handle_completed_write_requests +/* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */ -static void handle_completed_write_requests(raid5_conf_t *conf, +static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@ -2049,7 +2055,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_issuing_new_write_requests5(raid5_conf_t *conf, +static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@ -2136,10 +2142,10 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - handle_write_operations5(sh, s, rcw == 0, 0); + schedule_reconstruction5(sh, s, rcw == 0, 0); } -static void handle_issuing_new_write_requests6(raid5_conf_t *conf, +static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2597,8 +2603,7 @@ static void handle_stripe5(struct stripe_head *sh) * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2614,7 +2619,7 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -2622,7 +2627,7 @@ static void handle_stripe5(struct stripe_head *sh) */ if (s.to_read || s.non_overwrite || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) - handle_issuing_new_read_requests5(sh, &s, disks); + handle_stripe_fill5(sh, &s, disks); /* Now we check to see if any write operations have recently * completed @@ -2666,7 +2671,7 @@ static void handle_stripe5(struct stripe_head *sh) * block. */ if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_issuing_new_write_requests5(conf, sh, &s, disks); + handle_stripe_dirtying5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -2722,7 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh) sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - handle_write_operations5(sh, &s, 1, 1); + schedule_reconstruction5(sh, &s, 1, 1); } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -2854,8 +2859,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * might need to be failed */ if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 2 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2880,7 +2884,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -2888,11 +2892,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || (s.syncing && (s.uptodate < disks)) || s.expanding) - handle_issuing_new_read_requests6(sh, &s, &r6s, disks); + handle_stripe_fill6(sh, &s, &r6s, disks); /* now to consider writing and what else, if anything should be read */ if (s.to_write) - handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); + handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough -- cgit v1.1 From b5470dc5fc18a8ff6517c3bb538d1479e58ecb02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 27 Jun 2008 21:44:04 -0700 Subject: md: resolve external metadata handling deadlock in md_allow_write md_allow_write() marks the metadata dirty while holding mddev->lock and then waits for the write to complete. For externally managed metadata this causes a deadlock as userspace needs to take the lock to communicate that the metadata update has completed. Change md_allow_write() in the 'external' case to start the 'mark active' operation and then return -EAGAIN. The expected side effects while waiting for userspace to write 'active' to 'array_state' are holding off reshape (code currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and cause updates to 'raid_disks' to fail. Except for 'stripe_cache_size' changes these failures can be mitigated by coordinating with mdmon. md_write_start() still prevents writes from occurring until the metadata handler has had a chance to take action as it unconditionally waits for MD_CHANGE_CLEAN to be cleared. [neilb@suse.de: return -EAGAIN, try GFP_NOIO] Signed-off-by: Dan Williams --- drivers/md/raid5.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4426220..8f4c70a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; if (!conf) @@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) else break; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; -- cgit v1.1 From f233ea5c9e0d8b95e4283bf6a3436b88f6fd3586 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Mon, 21 Jul 2008 17:05:22 +1000 Subject: md: Make mddev->array_size sector-based. This patch renames the array_size field of struct mddev_s to array_sectors and converts all instances to use units of 512 byte sectors instead of 1k blocks. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/raid5.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8f4c70a..42a480b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3540,7 +3540,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped j == raid6_next_disk(sh->pd_idx, sh->disks)) continue; s = compute_blocknr(sh, j); - if (s < (mddev->array_size<<1)) { + if (s < mddev->array_sectors) { skipped = 1; continue; } @@ -4189,7 +4189,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_size = mddev->size * (conf->previous_raid_disks - + mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - conf->max_degraded); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4413,8 +4413,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) raid5_conf_t *conf = mddev_to_conf(mddev); sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = sectors * (mddev->raid_disks + - conf->max_degraded); + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->size << 1; @@ -4547,15 +4548,16 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_size = conf->mddev->size * + conf->mddev->array_sectors = 2 * conf->mddev->size * (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); conf->mddev->changed = 1; bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); + i_size_write(bdev->bd_inode, + (loff_t)conf->mddev->array_sectors << 9); mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } -- cgit v1.1