summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-02 13:10:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-02 13:10:25 -0700
commit1081230b748de8f03f37f80c53dfa89feda9b8de (patch)
tree7238d60e01f0843bad8f03b5d84e4220fbba5e76 /drivers/md/raid5.c
parentdf910390e2db07a76c87f258475f6c96253cee6c (diff)
parent2ca495ac27d245513c11fed70591b1838250e240 (diff)
downloadop-kernel-dev-1081230b748de8f03f37f80c53dfa89feda9b8de.zip
op-kernel-dev-1081230b748de8f03f37f80c53dfa89feda9b8de.tar.gz
Merge branch 'for-4.3/core' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: "This first core part of the block IO changes contains: - Cleanup of the bio IO error signaling from Christoph. We used to rely on the uptodate bit and passing around of an error, now we store the error in the bio itself. - Improvement of the above from myself, by shrinking the bio size down again to fit in two cachelines on x86-64. - Revert of the max_hw_sectors cap removal from a revision again, from Jeff Moyer. This caused performance regressions in various tests. Reinstate the limit, bump it to a more reasonable size instead. - Make /sys/block/<dev>/queue/discard_max_bytes writeable, by me. Most devices have huge trim limits, which can cause nasty latencies when deleting files. Enable the admin to configure the size down. We will look into having a more sane default instead of UINT_MAX sectors. - Improvement of the SGP gaps logic from Keith Busch. - Enable the block core to handle arbitrarily sized bios, which enables a nice simplification of bio_add_page() (which is an IO hot path). From Kent. - Improvements to the partition io stats accounting, making it faster. From Ming Lei. - Also from Ming Lei, a basic fixup for overflow of the sysfs pending file in blk-mq, as well as a fix for a blk-mq timeout race condition. - Ming Lin has been carrying Kents above mentioned patches forward for a while, and testing them. Ming also did a few fixes around that. - Sasha Levin found and fixed a use-after-free problem introduced by the bio->bi_error changes from Christoph. - Small blk cgroup cleanup from Viresh Kumar" * 'for-4.3/core' of git://git.kernel.dk/linux-block: (26 commits) blk: Fix bio_io_vec index when checking bvec gaps block: Replace SG_GAPS with new queue limits mask block: bump BLK_DEF_MAX_SECTORS to 2560 Revert "block: remove artifical max_hw_sectors cap" blk-mq: fix race between timeout and freeing request blk-mq: fix buffer overflow when reading sysfs file of 'pending' Documentation: update notes in biovecs about arbitrarily sized bios block: remove bio_get_nr_vecs() fs: use helper bio_add_page() instead of open coding on bi_io_vec block: kill merge_bvec_fn() completely md/raid5: get rid of bio_fits_rdev() md/raid5: split bio for chunk_aligned_read block: remove split code in blkdev_issue_{discard,write_same} btrfs: remove bio splitting and merge_bvec_fn() calls bcache: remove driver private bio splitting code block: simplify bio_add_page() block: make generic_make_request handle arbitrarily sized bios blk-cgroup: Drop unlikely before IS_ERR(_OR_NULL) block: don't access bio->bi_error after bio_put() block: shrink struct bio down to 2 cache lines again ...
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c147
1 files changed, 61 insertions, 86 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f757023..b29e89c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -233,7 +233,7 @@ static void return_io(struct bio *return_bi)
bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
+ bio_endio(bi);
bi = return_bi;
}
}
@@ -887,9 +887,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
}
static void
-raid5_end_read_request(struct bio *bi, int error);
+raid5_end_read_request(struct bio *bi);
static void
-raid5_end_write_request(struct bio *bi, int error);
+raid5_end_write_request(struct bio *bi);
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
@@ -2282,12 +2282,11 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
-static void raid5_end_read_request(struct bio * bi, int error)
+static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL;
sector_t s;
@@ -2296,9 +2295,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (bi == &sh->dev[i].req)
break;
- pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+ pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
@@ -2317,7 +2316,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
- if (uptodate) {
+ if (!bi->bi_error) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
/* Note that this cannot happen on a
@@ -2405,13 +2404,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
release_stripe(sh);
}
-static void raid5_end_write_request(struct bio *bi, int error)
+static void raid5_end_write_request(struct bio *bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *uninitialized_var(rdev);
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
sector_t first_bad;
int bad_sectors;
int replacement = 0;
@@ -2434,23 +2432,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
break;
}
}
- pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
}
if (replacement) {
- if (!uptodate)
+ if (bi->bi_error)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
- if (!uptodate) {
+ if (bi->bi_error) {
set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2471,7 +2469,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && !uptodate && !replacement)
+ if (sh->batch_head && bi->bi_error && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
@@ -3112,7 +3110,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
@@ -3136,7 +3135,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
@@ -3161,7 +3161,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
bi->bi_next = *return_bi;
*return_bi = bi;
@@ -4669,35 +4670,6 @@ static int raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-/* We want read requests to align with chunks where possible,
- * but write requests don't need to.
- */
-static int raid5_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
-
- /*
- * always allow writes to be mergeable, read as well if array
- * is degraded as we'll go through stripe cache anyway.
- */
- if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
- return biovec->bv_len;
-
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
-}
-
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
@@ -4756,13 +4728,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
* first).
* If the read failed..
*/
-static void raid5_align_endio(struct bio *bi, int error)
+static void raid5_align_endio(struct bio *bi)
{
struct bio* raid_bi = bi->bi_private;
struct mddev *mddev;
struct r5conf *conf;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
struct md_rdev *rdev;
+ int error = bi->bi_error;
bio_put(bi);
@@ -4773,10 +4745,10 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
- if (!error && uptodate) {
+ if (!error) {
trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
raid_bi, 0);
- bio_endio(raid_bi, 0);
+ bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return;
@@ -4787,26 +4759,7 @@ static void raid5_align_endio(struct bio *bi, int error)
add_bio_to_retry(raid_bi, conf);
}
-static int bio_fits_rdev(struct bio *bi)
-{
- struct request_queue *q = bdev_get_queue(bi->bi_bdev);
-
- if (bio_sectors(bi) > queue_max_sectors(q))
- return 0;
- blk_recount_segments(q, bi);
- if (bi->bi_phys_segments > queue_max_segments(q))
- return 0;
-
- if (q->merge_bvec_fn)
- /* it's too hard to apply the merge_bvec_fn at this stage,
- * just just give up
- */
- return 0;
-
- return 1;
-}
-
-static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
+static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
{
struct r5conf *conf = mddev->private;
int dd_idx;
@@ -4815,7 +4768,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
sector_t end_sector;
if (!in_chunk_boundary(mddev, raid_bio)) {
- pr_debug("chunk_aligned_read : non aligned\n");
+ pr_debug("%s: non aligned\n", __func__);
return 0;
}
/*
@@ -4857,13 +4810,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev;
- __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+ bio_clear_flag(align_bi, BIO_SEG_VALID);
- if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
- /* too big in some way, or has a known bad block */
bio_put(align_bi);
rdev_dec_pending(rdev, mddev);
return 0;
@@ -4892,6 +4843,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
}
}
+static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
+{
+ struct bio *split;
+
+ do {
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
+
+ if (sectors < bio_sectors(raid_bio)) {
+ split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, raid_bio);
+ } else
+ split = raid_bio;
+
+ if (!raid5_read_one_chunk(mddev, split)) {
+ if (split != raid_bio)
+ generic_make_request(raid_bio);
+ return split;
+ }
+ } while (split != raid_bio);
+
+ return NULL;
+}
+
/* __get_priority_stripe - get the next stripe to process
*
* Full stripe writes are allowed to pass preread active stripes up until
@@ -5140,7 +5116,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
md_write_end(mddev);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
@@ -5169,9 +5145,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
- mddev->reshape_position == MaxSector &&
- chunk_aligned_read(mddev,bi))
- return;
+ mddev->reshape_position == MaxSector) {
+ bi = chunk_aligned_read(mddev, bi);
+ if (!bi)
+ return;
+ }
if (unlikely(bi->bi_rw & REQ_DISCARD)) {
make_discard_request(mddev, bi);
@@ -5304,7 +5282,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ bi->bi_error = -EIO;
break;
}
}
@@ -5318,7 +5296,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
@@ -5714,7 +5692,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
if (remaining == 0) {
trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
raid_bio, 0);
- bio_endio(raid_bio, 0);
+ bio_endio(raid_bio);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -7779,7 +7757,6 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
@@ -7803,7 +7780,6 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
@@ -7828,7 +7804,6 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)
OpenPOWER on IntegriCloud