summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid1.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--drivers/md/raid1.c164
1 files changed, 121 insertions, 43 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36a8fc0..9f7f8be 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
*/
#define NR_RAID1_BIOS 256
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
/* When there are this many requests queue to be written by
* the raid1 thread, we become 'congested' to provide back-pressure
* for writeback.
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
- int start_disk;
- int best_disk;
- int i;
+ int best_disk, best_dist_disk, best_pending_disk;
+ int has_nonrot_disk;
+ int disk;
sector_t best_dist;
+ unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
+ int choose_next_idle;
rcu_read_lock();
/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
retry:
sectors = r1_bio->sectors;
best_disk = -1;
+ best_dist_disk = -1;
best_dist = MaxSector;
+ best_pending_disk = -1;
+ min_pending = UINT_MAX;
best_good_sectors = 0;
+ has_nonrot_disk = 0;
+ choose_next_idle = 0;
if (conf->mddev->recovery_cp < MaxSector &&
- (this_sector + sectors >= conf->next_resync)) {
+ (this_sector + sectors >= conf->next_resync))
choose_first = 1;
- start_disk = 0;
- } else {
+ else
choose_first = 0;
- start_disk = conf->last_used;
- }
- for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
-
- int disk = start_disk + i;
- if (disk >= conf->raid_disks * 2)
- disk -= conf->raid_disks * 2;
+ unsigned int pending;
+ bool nonrot;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
} else
best_good_sectors = sectors;
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
- if (choose_first
- /* Don't change to another disk for sequential reads */
- || conf->next_seq_sect == this_sector
- || dist == 0
- /* If device is idle, use it */
- || atomic_read(&rdev->nr_pending) == 0) {
+ if (choose_first) {
+ best_disk = disk;
+ break;
+ }
+ /* Don't change to another disk for sequential reads */
+ if (conf->mirrors[disk].next_seq_sect == this_sector
+ || dist == 0) {
+ int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+ struct raid1_info *mirror = &conf->mirrors[disk];
+
+ best_disk = disk;
+ /*
+ * If buffered sequential IO size exceeds optimal
+ * iosize, check if there is idle disk. If yes, choose
+ * the idle disk. read_balance could already choose an
+ * idle disk before noticing it's a sequential IO in
+ * this disk. This doesn't matter because this disk
+ * will idle, next time it will be utilized after the
+ * first disk has IO size exceeds optimal iosize. In
+ * this way, iosize of the first disk will be optimal
+ * iosize at least. iosize of the second disk might be
+ * small, but not a big deal since when the second disk
+ * starts IO, the first disk is likely still busy.
+ */
+ if (nonrot && opt_iosize > 0 &&
+ mirror->seq_start != MaxSector &&
+ mirror->next_seq_sect > opt_iosize &&
+ mirror->next_seq_sect - opt_iosize >=
+ mirror->seq_start) {
+ choose_next_idle = 1;
+ continue;
+ }
+ break;
+ }
+ /* If device is idle, use it */
+ if (pending == 0) {
best_disk = disk;
break;
}
+
+ if (choose_next_idle)
+ continue;
+
+ if (min_pending > pending) {
+ min_pending = pending;
+ best_pending_disk = disk;
+ }
+
if (dist < best_dist) {
best_dist = dist;
- best_disk = disk;
+ best_dist_disk = disk;
}
}
+ /*
+ * If all disks are rotational, choose the closest disk. If any disk is
+ * non-rotational, choose the disk with less pending request even the
+ * disk is rotational, which might/might not be optimal for raids with
+ * mixed ratation/non-rotational disks depending on workload.
+ */
+ if (best_disk == -1) {
+ if (has_nonrot_disk)
+ best_disk = best_pending_disk;
+ else
+ best_disk = best_dist_disk;
+ }
+
if (best_disk >= 0) {
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
goto retry;
}
sectors = best_good_sectors;
- conf->next_seq_sect = this_sector + sectors;
- conf->last_used = best_disk;
+
+ if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+ conf->mirrors[best_disk].seq_start = this_sector;
+
+ conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
rcu_read_unlock();
*max_sectors = sectors;
@@ -873,7 +947,7 @@ do_sync_io:
static void make_request(struct mddev *mddev, struct bio * bio)
{
struct r1conf *conf = mddev->private;
- struct mirror_info *mirror;
+ struct raid1_info *mirror;
struct r1bio *r1_bio;
struct bio *read_bio;
int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = -EEXIST;
int mirror = 0;
- struct mirror_info *p;
+ struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct mirror_info *p = conf->mirrors+ number;
+ struct raid1_info *p = conf->mirrors + number;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
@@ -2370,6 +2444,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio->bi_rw = READ;
bio->bi_end_io = end_sync_read;
read_targets++;
+ } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+ /*
+ * The device is suitable for reading (InSync),
+ * but has bad block(s) here. Let's try to correct them,
+ * if we are doing resync or repair. Otherwise, leave
+ * this device alone for this sync request.
+ */
+ bio->bi_rw = WRITE;
+ bio->bi_end_io = end_sync_write;
+ write_targets++;
}
}
if (bio->bi_end_io) {
@@ -2427,7 +2513,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
/* There is nowhere to write, so all non-sync
* drives must be failed - so we are finished
*/
- sector_t rv = max_sector - sector_nr;
+ sector_t rv;
+ if (min_bad > 0)
+ max_sector = sector_nr + min_bad;
+ rv = max_sector - sector_nr;
*skipped = 1;
put_buf(r1_bio);
return rv;
@@ -2520,7 +2609,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
{
struct r1conf *conf;
int i;
- struct mirror_info *disk;
+ struct raid1_info *disk;
struct md_rdev *rdev;
int err = -ENOMEM;
@@ -2528,7 +2617,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf)
goto abort;
- conf->mirrors = kzalloc(sizeof(struct mirror_info)
+ conf->mirrors = kzalloc(sizeof(struct raid1_info)
* mddev->raid_disks * 2,
GFP_KERNEL);
if (!conf->mirrors)
@@ -2571,6 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
mddev->merge_check_needed = 1;
disk->head_position = 0;
+ disk->seq_start = MaxSector;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
@@ -2584,7 +2674,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->recovery_disabled = mddev->recovery_disabled - 1;
err = -EIO;
- conf->last_used = -1;
for (i = 0; i < conf->raid_disks * 2; i++) {
disk = conf->mirrors + i;
@@ -2610,19 +2699,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (disk->rdev &&
(disk->rdev->saved_raid_disk < 0))
conf->fullsync = 1;
- } else if (conf->last_used < 0)
- /*
- * The first working device is used as a
- * starting point to read balancing.
- */
- conf->last_used = i;
+ }
}
- if (conf->last_used < 0) {
- printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
- mdname(mddev));
- goto abort;
- }
err = -ENOMEM;
conf->thread = md_register_thread(raid1d, mddev, "raid1");
if (!conf->thread) {
@@ -2797,7 +2876,7 @@ static int raid1_reshape(struct mddev *mddev)
*/
mempool_t *newpool, *oldpool;
struct pool_info *newpoolinfo;
- struct mirror_info *newmirrors;
+ struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
@@ -2840,7 +2919,7 @@ static int raid1_reshape(struct mddev *mddev)
kfree(newpoolinfo);
return -ENOMEM;
}
- newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+ newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
GFP_KERNEL);
if (!newmirrors) {
kfree(newpoolinfo);
@@ -2879,7 +2958,6 @@ static int raid1_reshape(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
- conf->last_used = 0; /* just make sure it is in-range */
lower_barrier(conf);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
OpenPOWER on IntegriCloud