summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Makefile2
-rw-r--r--block/badblocks.c585
-rw-r--r--block/bio-integrity.c13
-rw-r--r--block/bio.c4
-rw-r--r--block/blk-core.c43
-rw-r--r--block/blk-merge.c22
-rw-r--r--block/blk-mq-cpumap.c2
-rw-r--r--block/blk-mq-tag.c11
-rw-r--r--block/blk-mq.c44
-rw-r--r--block/blk-mq.h11
-rw-r--r--block/blk-timeout.c17
-rw-r--r--block/blk.h2
-rw-r--r--block/genhd.c32
-rw-r--r--block/ioctl.c71
14 files changed, 789 insertions, 70 deletions
diff --git a/block/Makefile b/block/Makefile
index e850474..9eda232 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
- partitions/
+ badblocks.o partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
diff --git a/block/badblocks.c b/block/badblocks.c
new file mode 100644
index 0000000..7be53cb
--- /dev/null
+++ b/block/badblocks.c
@@ -0,0 +1,585 @@
+/*
+ * Bad block management
+ *
+ * - Heavily based on MD badblocks code from Neil Brown
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/badblocks.h>
+#include <linux/seqlock.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad block which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int hi;
+ int lo;
+ u64 *p = bb->page;
+ int rv;
+ sector_t target = s + sectors;
+ unsigned seq;
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ s >>= bb->shift;
+ target += (1<<bb->shift) - 1;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+ /* 'target' is now the first block after the bad range */
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ lo = 0;
+ rv = 0;
+ hi = bb->count;
+
+ /* Binary search between lo and hi for 'target'
+ * i.e. for the last range that starts before 'target'
+ */
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+ * are known not to be the last range before target.
+ * VARIANT: hi-lo is the number of possible
+ * ranges, and decreases until it reaches 1
+ */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ /* This could still be the one, earlier ranges
+ * could not.
+ */
+ lo = mid;
+ else
+ /* This and later ranges are definitely out. */
+ hi = mid;
+ }
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
+ if (hi > lo) {
+ /* need to check all range that end after 's' to see if
+ * any are unacknowledged.
+ */
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ if (BB_OFFSET(p[lo]) < target) {
+ /* starts before the end, and finishes after
+ * the start, so they must overlap
+ */
+ if (rv != -1 && BB_ACK(p[lo]))
+ rv = 1;
+ else
+ rv = -1;
+ *first_bad = BB_OFFSET(p[lo]);
+ *bad_sectors = BB_LEN(p[lo]);
+ }
+ lo--;
+ }
+ }
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: weather to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ u64 *p;
+ int lo, hi;
+ int rv = 0;
+ unsigned long flags;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 0;
+
+ if (bb->shift) {
+ /* round the start down, and the end up */
+ sector_t next = s + sectors;
+
+ s >>= bb->shift;
+ next += (1<<bb->shift) - 1;
+ next >>= bb->shift;
+ sectors = next - s;
+ }
+
+ write_seqlock_irqsave(&bb->lock, flags);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts at-or-before 's' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a <= s)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
+ hi = lo;
+
+ if (hi > lo) {
+ /* we found a range that might merge with the start
+ * of our new range
+ */
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t e = a + BB_LEN(p[lo]);
+ int ack = BB_ACK(p[lo]);
+
+ if (e >= s) {
+ /* Yes, we can merge with a previous range */
+ if (s == a && s + sectors >= e)
+ /* new range covers old */
+ ack = acknowledged;
+ else
+ ack = ack && acknowledged;
+
+ if (e < s + sectors)
+ e = s + sectors;
+ if (e - a <= BB_MAX_LEN) {
+ p[lo] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ /* does not all fit in one range,
+ * make p[lo] maximal
+ */
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ }
+ }
+ if (sectors && hi < bb->count) {
+ /* 'hi' points to the first range that starts after 's'.
+ * Maybe we can merge with the start of that range
+ */
+ sector_t a = BB_OFFSET(p[hi]);
+ sector_t e = a + BB_LEN(p[hi]);
+ int ack = BB_ACK(p[hi]);
+
+ if (a <= s + sectors) {
+ /* merging is possible */
+ if (e <= s + sectors) {
+ /* full overlap */
+ e = s + sectors;
+ ack = acknowledged;
+ } else
+ ack = ack && acknowledged;
+
+ a = s;
+ if (e - a <= BB_MAX_LEN) {
+ p[hi] = BB_MAKE(a, e-a, ack);
+ s = e;
+ } else {
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
+ s = a + BB_MAX_LEN;
+ }
+ sectors = e - s;
+ lo = hi;
+ hi++;
+ }
+ }
+ if (sectors == 0 && hi < bb->count) {
+ /* we might be able to combine lo and hi */
+ /* Note: 's' is at the end of 'lo' */
+ sector_t a = BB_OFFSET(p[hi]);
+ int lolen = BB_LEN(p[lo]);
+ int hilen = BB_LEN(p[hi]);
+ int newlen = lolen + hilen - (s - a);
+
+ if (s >= a && newlen < BB_MAX_LEN) {
+ /* yes, we can combine them */
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
+
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+ memmove(p + hi, p + hi + 1,
+ (bb->count - hi - 1) * 8);
+ bb->count--;
+ }
+ }
+ while (sectors) {
+ /* didn't merge (it all).
+ * Need to add a range just before 'hi'
+ */
+ if (bb->count >= MAX_BADBLOCKS) {
+ /* No room for more */
+ rv = 1;
+ break;
+ } else {
+ int this_sectors = sectors;
+
+ memmove(p + hi + 1, p + hi,
+ (bb->count - hi) * 8);
+ bb->count++;
+
+ if (this_sectors > BB_MAX_LEN)
+ this_sectors = BB_MAX_LEN;
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
+ sectors -= this_sectors;
+ s += this_sectors;
+ }
+ }
+
+ bb->changed = 1;
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ write_sequnlock_irqrestore(&bb->lock, flags);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ *
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to clear badblocks
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ u64 *p;
+ int lo, hi;
+ sector_t target = s + sectors;
+ int rv = 0;
+
+ if (bb->shift > 0) {
+ /* When clearing we round the start up and the end down.
+ * This should not matter as the shift should align with
+ * the block size and no rounding should ever be needed.
+ * However it is better the think a block is bad when it
+ * isn't than to think a block is not bad when it is.
+ */
+ s += (1<<bb->shift) - 1;
+ s >>= bb->shift;
+ target >>= bb->shift;
+ sectors = target - s;
+ }
+
+ write_seqlock_irq(&bb->lock);
+
+ p = bb->page;
+ lo = 0;
+ hi = bb->count;
+ /* Find the last range that starts before 'target' */
+ while (hi - lo > 1) {
+ int mid = (lo + hi) / 2;
+ sector_t a = BB_OFFSET(p[mid]);
+
+ if (a < target)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (hi > lo) {
+ /* p[lo] is the last range that could overlap the
+ * current range. Earlier ranges could also overlap,
+ * but only this one can overlap the end of the range.
+ */
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+ /* Partial overlap, leave the tail of this range */
+ int ack = BB_ACK(p[lo]);
+ sector_t a = BB_OFFSET(p[lo]);
+ sector_t end = a + BB_LEN(p[lo]);
+
+ if (a < s) {
+ /* we need to split this range */
+ if (bb->count >= MAX_BADBLOCKS) {
+ rv = -ENOSPC;
+ goto out;
+ }
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+ bb->count++;
+ p[lo] = BB_MAKE(a, s-a, ack);
+ lo++;
+ }
+ p[lo] = BB_MAKE(target, end - target, ack);
+ /* there is no longer an overlap */
+ hi = lo;
+ lo--;
+ }
+ while (lo >= 0 &&
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+ /* This range does overlap */
+ if (BB_OFFSET(p[lo]) < s) {
+ /* Keep the early parts of this range. */
+ int ack = BB_ACK(p[lo]);
+ sector_t start = BB_OFFSET(p[lo]);
+
+ p[lo] = BB_MAKE(start, s - start, ack);
+ /* now low doesn't overlap, so.. */
+ break;
+ }
+ lo--;
+ }
+ /* 'lo' is strictly before, 'hi' is strictly after,
+ * anything between needs to be discarded
+ */
+ if (hi - lo > 1) {
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+ bb->count -= (hi - lo - 1);
+ }
+ }
+
+ bb->changed = 1;
+out:
+ write_sequnlock_irq(&bb->lock);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(badblocks_clear);
+
+/**
+ * ack_all_badblocks() - Acknowledge all bad blocks in a list.
+ * @bb: the badblocks structure that holds all badblock information
+ *
+ * This only succeeds if ->changed is clear. It is used by
+ * in-kernel metadata updates
+ */
+void ack_all_badblocks(struct badblocks *bb)
+{
+ if (bb->page == NULL || bb->changed)
+ /* no point even trying */
+ return;
+ write_seqlock_irq(&bb->lock);
+
+ if (bb->changed == 0 && bb->unacked_exist) {
+ u64 *p = bb->page;
+ int i;
+
+ for (i = 0; i < bb->count ; i++) {
+ if (!BB_ACK(p[i])) {
+ sector_t start = BB_OFFSET(p[i]);
+ int len = BB_LEN(p[i]);
+
+ p[i] = BB_MAKE(start, len, 1);
+ }
+ }
+ bb->unacked_exist = 0;
+ }
+ write_sequnlock_irq(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(ack_all_badblocks);
+
+/**
+ * badblocks_show() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @unack: weather to show unacknowledged badblocks
+ *
+ * Return:
+ * Length of returned data
+ */
+ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
+{
+ size_t len;
+ int i;
+ u64 *p = bb->page;
+ unsigned seq;
+
+ if (bb->shift < 0)
+ return 0;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ len = 0;
+ i = 0;
+
+ while (len < PAGE_SIZE && i < bb->count) {
+ sector_t s = BB_OFFSET(p[i]);
+ unsigned int length = BB_LEN(p[i]);
+ int ack = BB_ACK(p[i]);
+
+ i++;
+
+ if (unack && ack)
+ continue;
+
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
+ (unsigned long long)s << bb->shift,
+ length << bb->shift);
+ }
+ if (unack && len == 0)
+ bb->unacked_exist = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_show);
+
+/**
+ * badblocks_store() - sysfs access to bad-blocks list
+ * @bb: the badblocks structure that holds all badblock information
+ * @page: buffer received from sysfs
+ * @len: length of data received from sysfs
+ * @unack: weather to show unacknowledged badblocks
+ *
+ * Return:
+ * Length of the buffer processed or -ve error.
+ */
+ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
+ int unack)
+{
+ unsigned long long sector;
+ int length;
+ char newline;
+
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
+ case 3:
+ if (newline != '\n')
+ return -EINVAL;
+ case 2:
+ if (length <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (badblocks_set(bb, sector, length, !unack))
+ return -ENOSPC;
+ else
+ return len;
+}
+EXPORT_SYMBOL_GPL(badblocks_store);
+
+static int __badblocks_init(struct device *dev, struct badblocks *bb,
+ int enable)
+{
+ bb->dev = dev;
+ bb->count = 0;
+ if (enable)
+ bb->shift = 0;
+ else
+ bb->shift = -1;
+ if (dev)
+ bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
+ else
+ bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!bb->page) {
+ bb->shift = -1;
+ return -ENOMEM;
+ }
+ seqlock_init(&bb->lock);
+
+ return 0;
+}
+
+/**
+ * badblocks_init() - initialize the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ * @enable: weather to enable badblocks accounting
+ *
+ * Return:
+ * 0: success
+ * -ve errno: on error
+ */
+int badblocks_init(struct badblocks *bb, int enable)
+{
+ return __badblocks_init(NULL, bb, enable);
+}
+EXPORT_SYMBOL_GPL(badblocks_init);
+
+int devm_init_badblocks(struct device *dev, struct badblocks *bb)
+{
+ if (!bb)
+ return -EINVAL;
+ return __badblocks_init(dev, bb, 1);
+}
+EXPORT_SYMBOL_GPL(devm_init_badblocks);
+
+/**
+ * badblocks_exit() - free the badblocks structure
+ * @bb: the badblocks structure that holds all badblock information
+ */
+void badblocks_exit(struct badblocks *bb)
+{
+ if (!bb)
+ return;
+ if (bb->dev)
+ devm_kfree(bb->dev, bb->page);
+ else
+ kfree(bb->page);
+ bb->page = NULL;
+}
+EXPORT_SYMBOL_GPL(badblocks_exit);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index f6325d5..711e4d8d 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
}
if (unlikely(!bip))
- return NULL;
+ return ERR_PTR(-ENOMEM);
memset(bip, 0, sizeof(*bip));
@@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
return bip;
err:
mempool_free(bip, bs->bio_integrity_pool);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio)
/* Allocate bio integrity payload and integrity vectors */
bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (unlikely(bip == NULL)) {
+ if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
- return -EIO;
+ return PTR_ERR(bip);
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
@@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
BUG_ON(bip_src == NULL);
bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
- if (bip == NULL)
- return -EIO;
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
memcpy(bip->bip_vec, bip_src->bip_vec,
bip_src->bip_vcnt * sizeof(struct bio_vec));
diff --git a/block/bio.c b/block/bio.c
index 4f184d9..dbabd48 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1125,7 +1125,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
int i, ret;
int nr_pages = 0;
unsigned int len = iter->count;
- unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
+ unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;
for (i = 0; i < iter->nr_segs; i++) {
unsigned long uaddr;
@@ -1304,7 +1304,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
goto out_unmap;
}
- offset = uaddr & ~PAGE_MASK;
+ offset = offset_in_page(uaddr);
for (j = cur_page; j < page_limit; j++) {
unsigned int bytes = PAGE_SIZE - offset;
diff --git a/block/blk-core.c b/block/blk-core.c
index 3636be4..ab51685 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,7 +51,7 @@ DEFINE_IDA(blk_queue_ida);
/*
* For the allocated request tables
*/
-struct kmem_cache *request_cachep = NULL;
+struct kmem_cache *request_cachep;
/*
* For queue allocation
@@ -207,6 +207,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs)
EXPORT_SYMBOL(blk_delay_queue);
/**
+ * blk_start_queue_async - asynchronously restart a previously stopped queue
+ * @q: The &struct request_queue in question
+ *
+ * Description:
+ * blk_start_queue_async() will clear the stop flag on the queue, and
+ * ensure that the request_fn for the queue is run from an async
+ * context.
+ **/
+void blk_start_queue_async(struct request_queue *q)
+{
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+ blk_run_queue_async(q);
+}
+EXPORT_SYMBOL(blk_start_queue_async);
+
+/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
*
@@ -630,7 +646,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_alloc_queue);
-int blk_queue_enter(struct request_queue *q, gfp_t gfp)
+int blk_queue_enter(struct request_queue *q, bool nowait)
{
while (true) {
int ret;
@@ -638,7 +654,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
if (percpu_ref_tryget_live(&q->q_usage_counter))
return 0;
- if (!gfpflags_allow_blocking(gfp))
+ if (nowait)
return -EBUSY;
ret = wait_event_interruptible(q->mq_freeze_wq,
@@ -664,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
wake_up_all(&q->mq_freeze_wq);
}
+static void blk_rq_timed_out_timer(unsigned long data)
+{
+ struct request_queue *q = (struct request_queue *)data;
+
+ kblockd_schedule_work(&q->timeout_work);
+}
+
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
struct request_queue *q;
@@ -825,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
+ INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
@@ -1276,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
if (q->mq_ops)
- return blk_mq_alloc_request(q, rw, gfp_mask, false);
+ return blk_mq_alloc_request(q, rw,
+ (gfp_mask & __GFP_DIRECT_RECLAIM) ?
+ 0 : BLK_MQ_REQ_NOWAIT);
else
return blk_old_get_request(q, rw, gfp_mask);
}
@@ -1689,8 +1715,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
struct request *req;
unsigned int request_count = 0;
- blk_queue_split(q, &bio, q->bio_split);
-
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
@@ -1698,6 +1722,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
*/
blk_queue_bounce(q, &bio);
+ blk_queue_split(q, &bio, q->bio_split);
+
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio->bi_error = -EIO;
bio_endio(bio);
@@ -2044,8 +2070,7 @@ blk_qc_t generic_make_request(struct bio *bio)
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
-
+ if (likely(blk_queue_enter(q, false) == 0)) {
ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
@@ -3518,7 +3543,7 @@ int __init blk_dev_init(void)
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0, SLAB_PANIC, NULL);
- blk_requestq_cachep = kmem_cache_create("blkdev_queue",
+ blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
return 0;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e01405a..1699df5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -7,6 +7,8 @@
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
+#include <trace/events/block.h>
+
#include "blk.h"
static struct bio *blk_bio_discard_split(struct request_queue *q,
@@ -81,9 +83,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *new = NULL;
bio_for_each_segment(bv, bio, iter) {
- if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q))
- goto split;
-
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@@ -91,6 +90,22 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
+ if (sectors + (bv.bv_len >> 9) >
+ blk_max_size_offset(q, bio->bi_iter.bi_sector)) {
+ /*
+ * Consider this a new segment if we're splitting in
+ * the middle of this vector.
+ */
+ if (nsegs < queue_max_segments(q) &&
+ sectors < blk_max_size_offset(q,
+ bio->bi_iter.bi_sector)) {
+ nsegs++;
+ sectors = blk_max_size_offset(q,
+ bio->bi_iter.bi_sector);
+ }
+ goto split;
+ }
+
if (bvprvp && blk_queue_cluster(q)) {
if (seg_size + bv.bv_len > queue_max_segment_size(q))
goto new_segment;
@@ -162,6 +177,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
split->bi_rw |= REQ_NOMERGE;
bio_chain(split, *bio);
+ trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
*bio = split;
}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 8764c24..d0634bc 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
for_each_possible_cpu(i) {
if (index == mq_map[i])
- return cpu_to_node(i);
+ return local_memory_node(cpu_to_node(i));
}
return NUMA_NO_NODE;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index a07ca34..abdbb47 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
if (tag != -1)
return tag;
- if (!gfpflags_allow_blocking(data->gfp))
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
return -1;
bs = bt_wait_ptr(bt, hctx);
@@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = data->q->mq_ops->map_queue(data->q,
data->ctx->cpu);
- if (data->reserved) {
+ if (data->flags & BLK_MQ_REQ_RESERVED) {
bt = &data->hctx->tags->breserved_tags;
} else {
last_tag = &data->ctx->last_tag;
@@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
- if (!data->reserved)
- return __blk_mq_get_tag(data);
-
- return __blk_mq_get_reserved_tag(data);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ return __blk_mq_get_reserved_tag(data);
+ return __blk_mq_get_tag(data);
}
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6d6f8fe..4c0622f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
return NULL;
}
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
- bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ unsigned int flags)
{
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
@@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
struct blk_mq_alloc_data alloc_data;
int ret;
- ret = blk_queue_enter(q, gfp);
+ ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
if (ret)
return ERR_PTR(ret);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
- reserved, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
- if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
+ if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
__blk_mq_run_hw_queue(hctx);
blk_mq_put_ctx(ctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
- hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
}
@@ -605,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
blk_mq_complete_request(rq, -EIO);
return;
}
- if (rq->cmd_flags & REQ_NO_TIMEOUT)
- return;
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
@@ -617,15 +613,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
}
}
-static void blk_mq_rq_timer(unsigned long priv)
+static void blk_mq_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *)priv;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
};
int i;
+ if (blk_queue_enter(q, true))
+ return;
+
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
@@ -640,6 +640,7 @@ static void blk_mq_rq_timer(unsigned long priv)
blk_mq_tag_idle(hctx);
}
}
+ blk_queue_exit(q);
}
/*
@@ -1175,8 +1176,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
rw |= REQ_SYNC;
trace_block_getrq(q, bio, rw);
- blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
- hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
if (unlikely(!rq)) {
__blk_mq_run_hw_queue(hctx);
@@ -1185,8 +1185,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q,
- __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
hctx = alloc_data.hctx;
@@ -1794,7 +1793,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
* not, we remain on the home node of the device
*/
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
- hctx->numa_node = cpu_to_node(i);
+ hctx->numa_node = local_memory_node(cpu_to_node(i));
}
}
@@ -1854,6 +1853,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
+ cpumask_copy(hctx->tags->cpumask, hctx->cpumask);
/*
* Set the map size to the number of mapped software queues.
* This is more accurate and more efficient than looping
@@ -1867,14 +1867,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
-
- queue_for_each_ctx(q, ctx, i) {
- if (!cpumask_test_cpu(i, online_mask))
- continue;
-
- hctx = q->mq_ops->map_queue(q, i);
- cpumask_set_cpu(i, hctx->tags->cpumask);
- }
}
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
@@ -2019,7 +2011,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
hctxs[i]->queue_num = i;
}
- setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+ INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->nr_queues = nr_cpu_ids;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 713820b..eaede8e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
- gfp_t gfp;
- bool reserved;
+ unsigned int flags;
/* input & output parameter */
struct blk_mq_ctx *ctx;
@@ -105,13 +104,11 @@ struct blk_mq_alloc_data {
};
static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
- struct request_queue *q, gfp_t gfp, bool reserved,
- struct blk_mq_ctx *ctx,
- struct blk_mq_hw_ctx *hctx)
+ struct request_queue *q, unsigned int flags,
+ struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
{
data->q = q;
- data->gfp = gfp;
- data->reserved = reserved;
+ data->flags = flags;
data->ctx = ctx;
data->hctx = hctx;
}
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index aa40aa9..a30441a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
}
}
-void blk_rq_timed_out_timer(unsigned long data)
+void blk_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *) data;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
unsigned long flags, next = 0;
struct request *rq, *tmp;
int next_set = 0;
+ if (blk_queue_enter(q, true))
+ return;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
+ blk_queue_exit(q);
}
/**
@@ -186,15 +190,13 @@ unsigned long blk_rq_timeout(unsigned long timeout)
* Notes:
* Each request has its own timer, and as it is added to the queue, we
* set up the timer. When the request completes, we cancel the timer.
+ * Queue lock must be held for the non-mq case, mq case doesn't care.
*/
void blk_add_timer(struct request *req)
{
struct request_queue *q = req->q;
unsigned long expiry;
- if (req->cmd_flags & REQ_NO_TIMEOUT)
- return;
-
/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
if (!q->mq_ops && !q->rq_timed_out_fn)
return;
@@ -209,6 +211,11 @@ void blk_add_timer(struct request *req)
req->timeout = q->rq_timeout;
req->deadline = jiffies + req->timeout;
+
+ /*
+ * Only the non-mq case needs to add the request to a protected list.
+ * For the mq case we simply scan the tag map.
+ */
if (!q->mq_ops)
list_add_tail(&req->timeout_list, &req->q->timeout_list);
diff --git a/block/blk.h b/block/blk.h
index c43926d..70e4aee 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void)
}
#endif
-void blk_rq_timed_out_timer(unsigned long data);
+void blk_timeout_work(struct work_struct *work);
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
void blk_delete_timer(struct request *);
diff --git a/block/genhd.c b/block/genhd.c
index e5cafa5..9f42526 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
+#include <linux/badblocks.h>
#include "blk.h"
@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
- disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
+/* sysfs access to bad-blocks list. */
+static ssize_t disk_badblocks_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return sprintf(page, "\n");
+
+ return badblocks_show(disk->bb, page, 0);
+}
+
+static ssize_t disk_badblocks_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *page, size_t len)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->bb)
+ return -ENXIO;
+
+ return badblocks_store(disk->bb, page, len, 0);
+}
+
/**
* get_gendisk - get partitioning information for a given device
* @devt: device to get partitioning information for
@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
+ disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1421,7 +1449,7 @@ static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);
/* disable in-kernel polling by default */
-static unsigned long disk_events_dfl_poll_msecs = 0;
+static unsigned long disk_events_dfl_poll_msecs;
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
diff --git a/block/ioctl.c b/block/ioctl.c
index 0918aed..77f5d177 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -4,6 +4,7 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
+#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD;
}
+#ifdef CONFIG_FS_DAX
+bool blkdev_dax_capable(struct block_device *bdev)
+{
+ struct gendisk *disk = bdev->bd_disk;
+
+ if (!disk->fops->direct_access)
+ return false;
+
+ /*
+ * If the partition is not aligned on a page boundary, we can't
+ * do dax I/O to it.
+ */
+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+ || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ return false;
+
+ /*
+ * If the device has known bad blocks, force all I/O through the
+ * driver / page cache.
+ *
+ * TODO: support finer grained dax error handling
+ */
+ if (disk->bb && disk->bb->count)
+ return false;
+
+ return true;
+}
+
+static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
+{
+ unsigned long arg;
+ int rc = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (get_user(arg, (int __user *)(argp)))
+ return -EFAULT;
+ arg = !!arg;
+ if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
+ return 0;
+
+ if (arg)
+ arg = S_DAX;
+
+ if (arg && !blkdev_dax_capable(bdev))
+ return -ENOTTY;
+
+ inode_lock(bdev->bd_inode);
+ if (bdev->bd_map_count == 0)
+ inode_set_flags(bdev->bd_inode, arg, S_DAX);
+ else
+ rc = -EBUSY;
+ inode_unlock(bdev->bd_inode);
+ return rc;
+}
+#else
+static int blkdev_daxset(struct block_device *bdev, int arg)
+{
+ if (arg)
+ return -ENOTTY;
+ return 0;
+}
+#endif
+
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKDAXSET:
+ return blkdev_daxset(bdev, arg);
+ case BLKDAXGET:
+ return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
+ break;
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
OpenPOWER on IntegriCloud