From eef35c2d41ddcc653c20d26b977acaa45c811e1f Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Aug 2010 21:11:15 +0200 Subject: Fix spelling fuction -> function in comments To avoid more patches, I also fixed other spelling and grammar bugs when they were in the same or following line: successfull -> successful parse -> parses controler -> controller controlers -> controllers Cc: Jiri Kosina Cc: linux-kernel@vger.kernel.org Signed-off-by: Stefan Weil Signed-off-by: Jiri Kosina --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f0640d7..7ac24fa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1618,7 +1618,7 @@ EXPORT_SYMBOL(submit_bio); * the insertion using this generic function. * * This function should also be useful for request stacking drivers - * in some cases below, so export this fuction. + * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue * limits while requests are in the queue (e.g. dm's table swapping). * Such request stacking drivers should check those requests agaist -- cgit v1.1 From 6958f145459ca7ad9715024de97445addacb8510 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: kill QUEUE_ORDERED_BY_TAG Nobody is making meaningful use of ORDERED_BY_TAG now and queue draining for barrier requests will be removed soon which will render the advantage of tag ordering moot. Kill ORDERED_BY_TAG. The following users are affected. * brd: converted to ORDERED_DRAIN. * virtio_blk: ORDERED_TAG path was already marked deprecated. Removed. * xen-blkfront: ORDERED_TAG case dropped. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Cc: Nick Piggin Cc: Michael S. Tsirkin Cc: Jeremy Fitzhardinge Cc: Chris Wright Signed-off-by: Jens Axboe --- block/blk-barrier.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index f0faefc..c807e9c 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -26,10 +26,7 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered) if (ordered != QUEUE_ORDERED_NONE && ordered != QUEUE_ORDERED_DRAIN && ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { + ordered != QUEUE_ORDERED_DRAIN_FUA) { printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); return -EINVAL; } @@ -155,21 +152,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) * For an empty barrier, there's no actual BAR request, which * in turn makes POSTFLUSH unnecessary. Mask them off. */ - if (!blk_rq_sectors(rq)) { + if (!blk_rq_sectors(rq)) q->ordered &= ~(QUEUE_ORDERED_DO_BAR | QUEUE_ORDERED_DO_POSTFLUSH); - /* - * Empty barrier on a write-through device w/ ordered - * tag has no command to issue and without any command - * to issue, ordering by tag can't be used. Drain - * instead. - */ - if ((q->ordered & QUEUE_ORDERED_BY_TAG) && - !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { - q->ordered &= ~QUEUE_ORDERED_BY_TAG; - q->ordered |= QUEUE_ORDERED_BY_DRAIN; - } - } /* stash away the original request */ blk_dequeue_request(rq); @@ -210,7 +195,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) } else skip |= QUEUE_ORDSEQ_PREFLUSH; - if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) + if (queue_in_flight(q)) rq = NULL; else skip |= QUEUE_ORDSEQ_DRAIN; @@ -257,16 +242,10 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) rq != &q->pre_flush_rq && rq != &q->post_flush_rq) return true; - if (q->ordered & QUEUE_ORDERED_BY_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } + /* Ordered by draining. Wait for turn. */ + WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); + if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) + *rqp = NULL; return true; } -- cgit v1.1 From 4913efe456c987057e5d36a3f0a55422a9072cae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: deprecate barrier and replace blk_queue_ordered() with blk_queue_flush() Barrier is deemed too heavy and will soon be replaced by FLUSH/FUA requests. Deprecate barrier. All REQ_HARDBARRIERs are failed with -EOPNOTSUPP and blk_queue_ordered() is replaced with simpler blk_queue_flush(). blk_queue_flush() takes combinations of REQ_FLUSH and FUA. If a device has write cache and can flush it, it should set REQ_FLUSH. If the device can handle FUA writes, it should also set REQ_FUA. All blk_queue_ordered() users are converted. * ORDERED_DRAIN is mapped to 0 which is the default value. * ORDERED_DRAIN_FLUSH is mapped to REQ_FLUSH. * ORDERED_DRAIN_FLUSH_FUA is mapped to REQ_FLUSH | REQ_FUA. Signed-off-by: Tejun Heo Acked-by: Boaz Harrosh Cc: Christoph Hellwig Cc: Nick Piggin Cc: Michael S. Tsirkin Cc: Jeremy Fitzhardinge Cc: Chris Wright Cc: FUJITA Tomonori Cc: Geert Uytterhoeven Cc: David S. Miller Cc: Alasdair G Kergon Cc: Pierre Ossman Cc: Stefan Weinhuber Signed-off-by: Jens Axboe --- block/blk-barrier.c | 29 ----------------------------- block/blk-core.c | 6 ++++-- block/blk-settings.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index c807e9c..ed0aba5 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -9,35 +9,6 @@ #include "blk.h" -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. - * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered) -{ - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - - return 0; -} -EXPORT_SYMBOL(blk_queue_ordered); - /* * Cache flushing for ordered writes handling */ diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7..f063541 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1203,11 +1203,13 @@ static int __make_request(struct request_queue *q, struct bio *bio) const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; - if ((bio->bi_rw & REQ_HARDBARRIER) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { + /* REQ_HARDBARRIER is no more */ + if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, + "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { bio_endio(bio, -EOPNOTSUPP); return 0; } + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4b..9b18afc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -794,6 +794,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); +/** + * blk_queue_flush - configure queue's cache flush capability + * @q: the request queue for the device + * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA + * + * Tell block layer cache flush capability of @q. If it supports + * flushing, REQ_FLUSH should be set. If it supports bypassing + * write cache for individual writes, REQ_FUA should be set. + */ +void blk_queue_flush(struct request_queue *q, unsigned int flush) +{ + WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); + + if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) + flush &= ~REQ_FUA; + + q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); +} +EXPORT_SYMBOL_GPL(blk_queue_flush); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; -- cgit v1.1 From dd831006d5be7f74c3fe7aef82380c51c3637960 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: misc cleanups in barrier code Make the following cleanups in preparation of barrier/flush update. * blk_do_ordered() declaration is moved from include/linux/blkdev.h to block/blk.h. * blk_do_ordered() now returns pointer to struct request, with %NULL meaning "try the next request" and ERR_PTR(-EAGAIN) "try again later". The third case will be dropped with further changes. * In the initialization of proxy barrier request, data direction is already set by init_request_from_bio(). Drop unnecessary explicit REQ_WRITE setting and move init_request_from_bio() above REQ_FUA flag setting. * add_request() is collapsed into __make_request(). These changes don't make any functional difference. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-barrier.c | 32 ++++++++++++++------------------ block/blk-core.c | 21 ++++----------------- block/blk.h | 7 +++++-- 3 files changed, 23 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index ed0aba5..f1be85b 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -110,9 +110,9 @@ static void queue_flush(struct request_queue *q, unsigned which) elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static inline bool start_ordered(struct request_queue *q, struct request **rqp) +static inline struct request *start_ordered(struct request_queue *q, + struct request *rq) { - struct request *rq = *rqp; unsigned skip = 0; q->orderr = 0; @@ -149,11 +149,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) /* initialize proxy request and queue it */ blk_rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_WRITE; + init_request_from_bio(rq, q->orig_bar_rq->bio); if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; - init_request_from_bio(rq, q->orig_bar_rq->bio); rq->end_io = bar_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); @@ -171,27 +169,26 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) else skip |= QUEUE_ORDSEQ_DRAIN; - *rqp = rq; - /* * Complete skipped sequences. If whole sequence is complete, - * return false to tell elevator that this request is gone. + * return %NULL to tell elevator that this request is gone. */ - return !blk_ordered_complete_seq(q, skip, 0); + if (blk_ordered_complete_seq(q, skip, 0)) + rq = NULL; + return rq; } -bool blk_do_ordered(struct request_queue *q, struct request **rqp) +struct request *blk_do_ordered(struct request_queue *q, struct request *rq) { - struct request *rq = *rqp; const int is_barrier = rq->cmd_type == REQ_TYPE_FS && (rq->cmd_flags & REQ_HARDBARRIER); if (!q->ordseq) { if (!is_barrier) - return true; + return rq; if (q->next_ordered != QUEUE_ORDERED_NONE) - return start_ordered(q, rqp); + return start_ordered(q, rq); else { /* * Queue ordering not supported. Terminate @@ -199,8 +196,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) */ blk_dequeue_request(rq); __blk_end_request_all(rq, -EOPNOTSUPP); - *rqp = NULL; - return false; + return NULL; } } @@ -211,14 +207,14 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) /* Special requests are not subject to ordering rules. */ if (rq->cmd_type != REQ_TYPE_FS && rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return true; + return rq; /* Ordered by draining. Wait for turn. */ WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; + rq = ERR_PTR(-EAGAIN); - return true; + return rq; } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/blk-core.c b/block/blk-core.c index f063541..f8d37a8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1037,22 +1037,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_insert_request); -/* - * add-request adds a request to the linked list. - * queue lock is held and interrupts disabled, as we muck with the - * request queue list. - */ -static inline void add_request(struct request_queue *q, struct request *req) -{ - drive_stat_acct(req, 1); - - /* - * elevator indicated where it wants this request to be - * inserted at elevator_merge time - */ - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); -} - static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { @@ -1316,7 +1300,10 @@ get_rq: req->cpu = blk_cpu_to_group(smp_processor_id()); if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); - add_request(q, req); + + /* insert the request into the elevator */ + drive_stat_acct(req, 1); + __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); diff --git a/block/blk.h b/block/blk.h index 6e7dc87..874eb4e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +struct request *blk_do_ordered(struct request_queue *q, struct request *rq); + static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; @@ -58,8 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (blk_do_ordered(q, &rq)) - return rq; + rq = blk_do_ordered(q, rq); + if (rq) + return !IS_ERR(rq) ? rq : NULL; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) -- cgit v1.1 From 28e7d1845216538303bb95d679d8fd4de50e2f1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: drop barrier ordering by queue draining Filesystems will take all the responsibilities for ordering requests around commit writes and will only indicate how the commit writes themselves should be handled by block layers. This patch drops barrier ordering by queue draining from block layer. Ordering by draining implementation was somewhat invasive to request handling. List of notable changes follow. * Each queue has 1 bit color which is flipped on each barrier issue. This is used to track whether a given request is issued before the current barrier or not. REQ_ORDERED_COLOR flag and coloring implementation in __elv_add_request() are removed. * Requests which shouldn't be processed yet for draining were stalled by returning -EAGAIN from blk_do_ordered() according to the test result between blk_ordered_req_seq() and blk_blk_ordered_cur_seq(). This logic is removed. * Draining completion logic in elv_completed_request() removed. * All barrier sequence requests were queued to request queue and then trckled to lower layer according to progress and thus maintaining request orders during requeue was necessary. This is replaced by queueing the next request in the barrier sequence only after the current one is complete from blk_ordered_complete_seq(), which removes the need for multiple proxy requests in struct request_queue and the request sorting logic in the ELEVATOR_INSERT_REQUEUE path of elv_insert(). * As barriers no longer have ordering constraints, there's no need to dump the whole elevator onto the dispatch queue on each barrier. Insert barriers at the front instead. * If other barrier requests come to the front of the dispatch queue while one is already in progress, they are stored in q->pending_barriers and restored to dispatch queue one-by-one after each barrier completion from blk_ordered_complete_seq(). Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-barrier.c | 220 ++++++++++++++++++++-------------------------------- block/blk-core.c | 11 ++- block/blk.h | 2 +- block/elevator.c | 79 +++---------------- 4 files changed, 105 insertions(+), 207 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index f1be85b..e8b2e5c 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -9,6 +9,8 @@ #include "blk.h" +static struct request *queue_next_ordseq(struct request_queue *q); + /* * Cache flushing for ordered writes handling */ @@ -19,38 +21,10 @@ unsigned blk_ordered_cur_seq(struct request_queue *q) return 1 << ffz(q->ordseq); } -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (rq->cmd_type != REQ_TYPE_FS) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) +static struct request *blk_ordered_complete_seq(struct request_queue *q, + unsigned seq, int error) { - struct request *rq; + struct request *next_rq = NULL; if (error && !q->orderr) q->orderr = error; @@ -58,16 +32,22 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) BUG_ON(q->ordseq & seq); q->ordseq |= seq; - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return false; - - /* - * Okay, sequence complete. - */ - q->ordseq = 0; - rq = q->orig_bar_rq; - __blk_end_request_all(rq, q->orderr); - return true; + if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { + /* not complete yet, queue the next ordered sequence */ + next_rq = queue_next_ordseq(q); + } else { + /* complete this barrier request */ + __blk_end_request_all(q->orig_bar_rq, q->orderr); + q->orig_bar_rq = NULL; + q->ordseq = 0; + + /* dispatch the next barrier if there's one */ + if (!list_empty(&q->pending_barriers)) { + next_rq = list_entry_rq(q->pending_barriers.next); + list_move(&next_rq->queuelist, &q->queue_head); + } + } + return next_rq; } static void pre_flush_end_io(struct request *rq, int error) @@ -88,133 +68,105 @@ static void post_flush_end_io(struct request *rq, int error) blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); } -static void queue_flush(struct request_queue *q, unsigned which) +static void queue_flush(struct request_queue *q, struct request *rq, + rq_end_io_fn *end_io) { - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_DO_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - blk_rq_init(q, rq); rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; + rq->cmd_flags = REQ_FLUSH; rq->rq_disk = q->orig_bar_rq->rq_disk; rq->end_io = end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static inline struct request *start_ordered(struct request_queue *q, - struct request *rq) +static struct request *queue_next_ordseq(struct request_queue *q) { - unsigned skip = 0; - - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - - /* stash away the original request */ - blk_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = NULL; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. - */ - if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); - rq = &q->post_flush_rq; - } else - skip |= QUEUE_ORDSEQ_POSTFLUSH; + struct request *rq = &q->bar_rq; - if (q->ordered & QUEUE_ORDERED_DO_BAR) { - rq = &q->bar_rq; + switch (blk_ordered_cur_seq(q)) { + case QUEUE_ORDSEQ_PREFLUSH: + queue_flush(q, rq, pre_flush_end_io); + break; + case QUEUE_ORDSEQ_BAR: /* initialize proxy request and queue it */ blk_rq_init(q, rq); init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->cmd_flags &= ~REQ_HARDBARRIER; if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; rq->end_io = bar_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - } else - skip |= QUEUE_ORDSEQ_BAR; + break; - if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); - rq = &q->pre_flush_rq; - } else - skip |= QUEUE_ORDSEQ_PREFLUSH; + case QUEUE_ORDSEQ_POSTFLUSH: + queue_flush(q, rq, post_flush_end_io); + break; - if (queue_in_flight(q)) - rq = NULL; - else - skip |= QUEUE_ORDSEQ_DRAIN; - - /* - * Complete skipped sequences. If whole sequence is complete, - * return %NULL to tell elevator that this request is gone. - */ - if (blk_ordered_complete_seq(q, skip, 0)) - rq = NULL; + default: + BUG(); + } return rq; } struct request *blk_do_ordered(struct request_queue *q, struct request *rq) { - const int is_barrier = rq->cmd_type == REQ_TYPE_FS && - (rq->cmd_flags & REQ_HARDBARRIER); - - if (!q->ordseq) { - if (!is_barrier) - return rq; - - if (q->next_ordered != QUEUE_ORDERED_NONE) - return start_ordered(q, rq); - else { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - return NULL; - } + unsigned skip = 0; + + if (!(rq->cmd_flags & REQ_HARDBARRIER)) + return rq; + + if (q->ordseq) { + /* + * Barrier is already in progress and they can't be + * processed in parallel. Queue for later processing. + */ + list_move_tail(&rq->queuelist, &q->pending_barriers); + return NULL; + } + + if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { + /* + * Queue ordering not supported. Terminate + * with prejudice. + */ + blk_dequeue_request(rq); + __blk_end_request_all(rq, -EOPNOTSUPP); + return NULL; } /* - * Ordered sequence in progress + * Start a new ordered sequence */ + q->orderr = 0; + q->ordered = q->next_ordered; + q->ordseq |= QUEUE_ORDSEQ_STARTED; - /* Special requests are not subject to ordering rules. */ - if (rq->cmd_type != REQ_TYPE_FS && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return rq; + /* + * For an empty barrier, there's no actual BAR request, which + * in turn makes POSTFLUSH unnecessary. Mask them off. + */ + if (!blk_rq_sectors(rq)) + q->ordered &= ~(QUEUE_ORDERED_DO_BAR | + QUEUE_ORDERED_DO_POSTFLUSH); - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - rq = ERR_PTR(-EAGAIN); + /* stash away the original request */ + blk_dequeue_request(rq); + q->orig_bar_rq = rq; - return rq; + if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) + skip |= QUEUE_ORDSEQ_PREFLUSH; + + if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) + skip |= QUEUE_ORDSEQ_BAR; + + if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) + skip |= QUEUE_ORDSEQ_POSTFLUSH; + + /* complete skipped sequences and return the first sequence */ + return blk_ordered_complete_seq(q, skip, 0); } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/blk-core.c b/block/blk-core.c index f8d37a8..d316662 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); + INIT_LIST_HEAD(&q->pending_barriers); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1185,6 +1186,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) const bool sync = (bio->bi_rw & REQ_SYNC); const bool unplug = (bio->bi_rw & REQ_UNPLUG); const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + int where = ELEVATOR_INSERT_SORT; int rw_flags; /* REQ_HARDBARRIER is no more */ @@ -1203,7 +1205,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) + if (bio->bi_rw & REQ_HARDBARRIER) { + where = ELEVATOR_INSERT_FRONT; + goto get_rq; + } + + if (elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1303,7 +1310,7 @@ get_rq: /* insert the request into the elevator */ drive_stat_acct(req, 1); - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); + __elv_add_request(q, req, where, 0); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); diff --git a/block/blk.h b/block/blk.h index 874eb4e..08081e4b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -62,7 +62,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) rq = list_entry_rq(q->queue_head.next); rq = blk_do_ordered(q, rq); if (rq) - return !IS_ERR(rq) ? rq : NULL; + return rq; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) diff --git a/block/elevator.c b/block/elevator.c index ec585c9..241c69c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q) void elv_insert(struct request_queue *q, struct request *rq, int where) { - struct list_head *pos; - unsigned ordseq; int unplug_it = 1; trace_block_rq_insert(q, rq); @@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) rq->q = q; switch (where) { + case ELEVATOR_INSERT_REQUEUE: + /* + * Most requeues happen because of a busy condition, + * don't force unplug of the queue for that case. + * Clear unplug_it and fall through. + */ + unplug_it = 0; + case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; - list_add(&rq->queuelist, &q->queue_head); break; @@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->elevator->ops->elevator_add_req_fn(q, rq); break; - case ELEVATOR_INSERT_REQUEUE: - /* - * If ordered flush isn't in progress, we do front - * insertion; otherwise, requests should be requeued - * in ordseq order. - */ - rq->cmd_flags |= REQ_SOFTBARRIER; - - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - */ - unplug_it = 0; - - if (q->ordseq == 0) { - list_add(&rq->queuelist, &q->queue_head); - break; - } - - ordseq = blk_ordered_req_seq(rq); - - list_for_each(pos, &q->queue_head) { - struct request *pos_rq = list_entry_rq(pos); - if (ordseq <= blk_ordered_req_seq(pos_rq)) - break; - } - - list_add_tail(&rq->queuelist, pos); - break; - default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); @@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (q->ordcolor) - rq->cmd_flags |= REQ_ORDERED_COLOR; - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { - /* - * toggle ordered color - */ - if (rq->cmd_flags & REQ_HARDBARRIER) - q->ordcolor ^= 1; - - /* - * barriers implicitly indicate back insertion - */ - if (where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - /* - * this request is scheduling boundary, update - * end_sector - */ + /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { q->end_sector = rq_end_sector(rq); @@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq) e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } - - /* - * Check if the queue is waiting for fs requests to be - * drained for flush sequence. - */ - if (unlikely(q->ordseq)) { - struct request *next = NULL; - - if (!list_empty(&q->queue_head)) - next = list_entry_rq(q->queue_head.next); - - if (!queue_in_flight(q) && - blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && - (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { - blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - __blk_run_queue(q); - } - } } #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) -- cgit v1.1 From 8839a0e055d9abd6c011d533373a8dd266cad011 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: rename blk-barrier.c to blk-flush.c Without ordering requirements, barrier and ordering are minomers. Rename block/blk-barrier.c to block/blk-flush.c. Rename of symbols will follow. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-barrier.c | 248 ---------------------------------------------------- block/blk-flush.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 249 deletions(-) delete mode 100644 block/blk-barrier.c create mode 100644 block/blk-flush.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 0bb499a..f627e4b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ - blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c deleted file mode 100644 index e8b2e5c..0000000 --- a/block/blk-barrier.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Functions related to barrier IO handling - */ -#include -#include -#include -#include -#include - -#include "blk.h" - -static struct request *queue_next_ordseq(struct request_queue *q); - -/* - * Cache flushing for ordered writes handling - */ -unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -static struct request *blk_ordered_complete_seq(struct request_queue *q, - unsigned seq, int error) -{ - struct request *next_rq = NULL; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { - /* not complete yet, queue the next ordered sequence */ - next_rq = queue_next_ordseq(q); - } else { - /* complete this barrier request */ - __blk_end_request_all(q->orig_bar_rq, q->orderr); - q->orig_bar_rq = NULL; - q->ordseq = 0; - - /* dispatch the next barrier if there's one */ - if (!list_empty(&q->pending_barriers)) { - next_rq = list_entry_rq(q->pending_barriers.next); - list_move(&next_rq->queuelist, &q->queue_head); - } - } - return next_rq; -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, struct request *rq, - rq_end_io_fn *end_io) -{ - blk_rq_init(q, rq); - rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = REQ_FLUSH; - rq->rq_disk = q->orig_bar_rq->rq_disk; - rq->end_io = end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static struct request *queue_next_ordseq(struct request_queue *q) -{ - struct request *rq = &q->bar_rq; - - switch (blk_ordered_cur_seq(q)) { - case QUEUE_ORDSEQ_PREFLUSH: - queue_flush(q, rq, pre_flush_end_io); - break; - - case QUEUE_ORDSEQ_BAR: - /* initialize proxy request and queue it */ - blk_rq_init(q, rq); - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->cmd_flags &= ~REQ_HARDBARRIER; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; - rq->end_io = bar_end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - break; - - case QUEUE_ORDSEQ_POSTFLUSH: - queue_flush(q, rq, post_flush_end_io); - break; - - default: - BUG(); - } - return rq; -} - -struct request *blk_do_ordered(struct request_queue *q, struct request *rq) -{ - unsigned skip = 0; - - if (!(rq->cmd_flags & REQ_HARDBARRIER)) - return rq; - - if (q->ordseq) { - /* - * Barrier is already in progress and they can't be - * processed in parallel. Queue for later processing. - */ - list_move_tail(&rq->queuelist, &q->pending_barriers); - return NULL; - } - - if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - return NULL; - } - - /* - * Start a new ordered sequence - */ - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - - /* stash away the original request */ - blk_dequeue_request(rq); - q->orig_bar_rq = rq; - - if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) - skip |= QUEUE_ORDSEQ_PREFLUSH; - - if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) - skip |= QUEUE_ORDSEQ_BAR; - - if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) - skip |= QUEUE_ORDSEQ_POSTFLUSH; - - /* complete skipped sequences and return the first sequence */ - return blk_ordered_complete_seq(q, skip, 0); -} - -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - -/** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) - * @error_sector: error sector - * @flags: BLKDEV_IFL_* flags to control behaviour - * - * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. If WAIT flag is not passed then caller may check only what - * request was pushed in some internal queue for later handling. - */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector, unsigned long flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret = 0; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - /* - * some block devices may not have their queue correctly set up here - * (e.g. loop device without a backing file) and so issuing a flush - * here will panic. Ensure there is a request function before issuing - * the barrier. - */ - if (!q->make_request_fn) - return -ENXIO; - - bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_bdev = bdev; - if (test_bit(BLKDEV_WAIT, &flags)) - bio->bi_private = &wait; - - bio_get(bio); - submit_bio(WRITE_BARRIER, bio); - if (test_bit(BLKDEV_WAIT, &flags)) { - wait_for_completion(&wait); - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; - } - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-flush.c b/block/blk-flush.c new file mode 100644 index 0000000..e8b2e5c --- /dev/null +++ b/block/blk-flush.c @@ -0,0 +1,248 @@ +/* + * Functions related to barrier IO handling + */ +#include +#include +#include +#include +#include + +#include "blk.h" + +static struct request *queue_next_ordseq(struct request_queue *q); + +/* + * Cache flushing for ordered writes handling + */ +unsigned blk_ordered_cur_seq(struct request_queue *q) +{ + if (!q->ordseq) + return 0; + return 1 << ffz(q->ordseq); +} + +static struct request *blk_ordered_complete_seq(struct request_queue *q, + unsigned seq, int error) +{ + struct request *next_rq = NULL; + + if (error && !q->orderr) + q->orderr = error; + + BUG_ON(q->ordseq & seq); + q->ordseq |= seq; + + if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { + /* not complete yet, queue the next ordered sequence */ + next_rq = queue_next_ordseq(q); + } else { + /* complete this barrier request */ + __blk_end_request_all(q->orig_bar_rq, q->orderr); + q->orig_bar_rq = NULL; + q->ordseq = 0; + + /* dispatch the next barrier if there's one */ + if (!list_empty(&q->pending_barriers)) { + next_rq = list_entry_rq(q->pending_barriers.next); + list_move(&next_rq->queuelist, &q->queue_head); + } + } + return next_rq; +} + +static void pre_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); +} + +static void bar_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); +} + +static void post_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); +} + +static void queue_flush(struct request_queue *q, struct request *rq, + rq_end_io_fn *end_io) +{ + blk_rq_init(q, rq); + rq->cmd_type = REQ_TYPE_FS; + rq->cmd_flags = REQ_FLUSH; + rq->rq_disk = q->orig_bar_rq->rq_disk; + rq->end_io = end_io; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); +} + +static struct request *queue_next_ordseq(struct request_queue *q) +{ + struct request *rq = &q->bar_rq; + + switch (blk_ordered_cur_seq(q)) { + case QUEUE_ORDSEQ_PREFLUSH: + queue_flush(q, rq, pre_flush_end_io); + break; + + case QUEUE_ORDSEQ_BAR: + /* initialize proxy request and queue it */ + blk_rq_init(q, rq); + init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->cmd_flags &= ~REQ_HARDBARRIER; + if (q->ordered & QUEUE_ORDERED_DO_FUA) + rq->cmd_flags |= REQ_FUA; + rq->end_io = bar_end_io; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + break; + + case QUEUE_ORDSEQ_POSTFLUSH: + queue_flush(q, rq, post_flush_end_io); + break; + + default: + BUG(); + } + return rq; +} + +struct request *blk_do_ordered(struct request_queue *q, struct request *rq) +{ + unsigned skip = 0; + + if (!(rq->cmd_flags & REQ_HARDBARRIER)) + return rq; + + if (q->ordseq) { + /* + * Barrier is already in progress and they can't be + * processed in parallel. Queue for later processing. + */ + list_move_tail(&rq->queuelist, &q->pending_barriers); + return NULL; + } + + if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { + /* + * Queue ordering not supported. Terminate + * with prejudice. + */ + blk_dequeue_request(rq); + __blk_end_request_all(rq, -EOPNOTSUPP); + return NULL; + } + + /* + * Start a new ordered sequence + */ + q->orderr = 0; + q->ordered = q->next_ordered; + q->ordseq |= QUEUE_ORDSEQ_STARTED; + + /* + * For an empty barrier, there's no actual BAR request, which + * in turn makes POSTFLUSH unnecessary. Mask them off. + */ + if (!blk_rq_sectors(rq)) + q->ordered &= ~(QUEUE_ORDERED_DO_BAR | + QUEUE_ORDERED_DO_POSTFLUSH); + + /* stash away the original request */ + blk_dequeue_request(rq); + q->orig_bar_rq = rq; + + if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) + skip |= QUEUE_ORDSEQ_PREFLUSH; + + if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) + skip |= QUEUE_ORDSEQ_BAR; + + if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) + skip |= QUEUE_ORDSEQ_POSTFLUSH; + + /* complete skipped sequences and return the first sequence */ + return blk_ordered_complete_seq(q, skip, 0); +} + +static void bio_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) + * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. + */ +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q; + struct bio *bio; + int ret = 0; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + /* + * some block devices may not have their queue correctly set up here + * (e.g. loop device without a backing file) and so issuing a flush + * here will panic. Ensure there is a request function before issuing + * the barrier. + */ + if (!q->make_request_fn) + return -ENXIO; + + bio = bio_alloc(gfp_mask, 0); + bio->bi_end_io = bio_end_empty_barrier; + bio->bi_bdev = bdev; + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; + + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + } + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_flush); -- cgit v1.1 From dd4c133f387c48f526022860ad70354637a80f4c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: rename barrier/ordered to flush With ordering requirements dropped, barrier and ordered are misnomers. Now all block layer does is sequencing FLUSH and FUA. Rename them to flush. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 21 ++++++------ block/blk-flush.c | 98 +++++++++++++++++++++++++++---------------------------- block/blk.h | 4 +-- 3 files changed, 60 insertions(+), 63 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d316662..8870ae4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, { struct request_queue *q = rq->q; - if (&q->bar_rq != rq) { + if (&q->flush_rq != rq) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) @@ -160,13 +160,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (bio->bi_size == 0) bio_endio(bio, error); } else { - /* - * Okay, this is the barrier request in progress, just - * record the error; + * Okay, this is the sequenced flush request in + * progress, just record the error; */ - if (error && !q->orderr) - q->orderr = error; + if (error && !q->flush_err) + q->flush_err = error; } } @@ -520,7 +519,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); - INIT_LIST_HEAD(&q->pending_barriers); + INIT_LIST_HEAD(&q->pending_flushes); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1764,11 +1763,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) static void blk_account_io_done(struct request *req) { /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. */ - if (blk_do_io_stat(req) && req != &req->q->bar_rq) { + if (blk_do_io_stat(req) && req != &req->q->flush_rq) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; diff --git a/block/blk-flush.c b/block/blk-flush.c index e8b2e5c..dd87322 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -9,41 +9,38 @@ #include "blk.h" -static struct request *queue_next_ordseq(struct request_queue *q); +static struct request *queue_next_fseq(struct request_queue *q); -/* - * Cache flushing for ordered writes handling - */ -unsigned blk_ordered_cur_seq(struct request_queue *q) +unsigned blk_flush_cur_seq(struct request_queue *q) { - if (!q->ordseq) + if (!q->flush_seq) return 0; - return 1 << ffz(q->ordseq); + return 1 << ffz(q->flush_seq); } -static struct request *blk_ordered_complete_seq(struct request_queue *q, - unsigned seq, int error) +static struct request *blk_flush_complete_seq(struct request_queue *q, + unsigned seq, int error) { struct request *next_rq = NULL; - if (error && !q->orderr) - q->orderr = error; + if (error && !q->flush_err) + q->flush_err = error; - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; + BUG_ON(q->flush_seq & seq); + q->flush_seq |= seq; - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { - /* not complete yet, queue the next ordered sequence */ - next_rq = queue_next_ordseq(q); + if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { + /* not complete yet, queue the next flush sequence */ + next_rq = queue_next_fseq(q); } else { - /* complete this barrier request */ - __blk_end_request_all(q->orig_bar_rq, q->orderr); - q->orig_bar_rq = NULL; - q->ordseq = 0; - - /* dispatch the next barrier if there's one */ - if (!list_empty(&q->pending_barriers)) { - next_rq = list_entry_rq(q->pending_barriers.next); + /* complete this flush request */ + __blk_end_request_all(q->orig_flush_rq, q->flush_err); + q->orig_flush_rq = NULL; + q->flush_seq = 0; + + /* dispatch the next flush if there's one */ + if (!list_empty(&q->pending_flushes)) { + next_rq = list_entry_rq(q->pending_flushes.next); list_move(&next_rq->queuelist, &q->queue_head); } } @@ -53,19 +50,19 @@ static struct request *blk_ordered_complete_seq(struct request_queue *q, static void pre_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); + blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error); } -static void bar_end_io(struct request *rq, int error) +static void flush_data_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); + blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error); } static void post_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); + blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error); } static void queue_flush(struct request_queue *q, struct request *rq, @@ -74,34 +71,34 @@ static void queue_flush(struct request_queue *q, struct request *rq, blk_rq_init(q, rq); rq->cmd_type = REQ_TYPE_FS; rq->cmd_flags = REQ_FLUSH; - rq->rq_disk = q->orig_bar_rq->rq_disk; + rq->rq_disk = q->orig_flush_rq->rq_disk; rq->end_io = end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static struct request *queue_next_ordseq(struct request_queue *q) +static struct request *queue_next_fseq(struct request_queue *q) { - struct request *rq = &q->bar_rq; + struct request *rq = &q->flush_rq; - switch (blk_ordered_cur_seq(q)) { - case QUEUE_ORDSEQ_PREFLUSH: + switch (blk_flush_cur_seq(q)) { + case QUEUE_FSEQ_PREFLUSH: queue_flush(q, rq, pre_flush_end_io); break; - case QUEUE_ORDSEQ_BAR: + case QUEUE_FSEQ_DATA: /* initialize proxy request and queue it */ blk_rq_init(q, rq); - init_request_from_bio(rq, q->orig_bar_rq->bio); + init_request_from_bio(rq, q->orig_flush_rq->bio); rq->cmd_flags &= ~REQ_HARDBARRIER; if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; - rq->end_io = bar_end_io; + rq->end_io = flush_data_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); break; - case QUEUE_ORDSEQ_POSTFLUSH: + case QUEUE_FSEQ_POSTFLUSH: queue_flush(q, rq, post_flush_end_io); break; @@ -111,19 +108,20 @@ static struct request *queue_next_ordseq(struct request_queue *q) return rq; } -struct request *blk_do_ordered(struct request_queue *q, struct request *rq) +struct request *blk_do_flush(struct request_queue *q, struct request *rq) { unsigned skip = 0; if (!(rq->cmd_flags & REQ_HARDBARRIER)) return rq; - if (q->ordseq) { + if (q->flush_seq) { /* - * Barrier is already in progress and they can't be - * processed in parallel. Queue for later processing. + * Sequenced flush is already in progress and they + * can't be processed in parallel. Queue for later + * processing. */ - list_move_tail(&rq->queuelist, &q->pending_barriers); + list_move_tail(&rq->queuelist, &q->pending_flushes); return NULL; } @@ -138,11 +136,11 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq) } /* - * Start a new ordered sequence + * Start a new flush sequence */ - q->orderr = 0; + q->flush_err = 0; q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; + q->flush_seq |= QUEUE_FSEQ_STARTED; /* * For an empty barrier, there's no actual BAR request, which @@ -154,19 +152,19 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq) /* stash away the original request */ blk_dequeue_request(rq); - q->orig_bar_rq = rq; + q->orig_flush_rq = rq; if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) - skip |= QUEUE_ORDSEQ_PREFLUSH; + skip |= QUEUE_FSEQ_PREFLUSH; if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) - skip |= QUEUE_ORDSEQ_BAR; + skip |= QUEUE_FSEQ_DATA; if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) - skip |= QUEUE_ORDSEQ_POSTFLUSH; + skip |= QUEUE_FSEQ_POSTFLUSH; /* complete skipped sequences and return the first sequence */ - return blk_ordered_complete_seq(q, skip, 0); + return blk_flush_complete_seq(q, skip, 0); } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/blk.h b/block/blk.h index 08081e4b..24b92bd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,7 +51,7 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -struct request *blk_do_ordered(struct request_queue *q, struct request *rq); +struct request *blk_do_flush(struct request_queue *q, struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) { @@ -60,7 +60,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - rq = blk_do_ordered(q, rq); + rq = blk_do_flush(q, rq); if (rq) return rq; } -- cgit v1.1 From 4fed947cb311e5aa51781d316cefca836352f6ce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: implement REQ_FLUSH/FUA based interface for FLUSH/FUA requests Now that the backend conversion is complete, export sequenced FLUSH/FUA capability through REQ_FLUSH/FUA flags. REQ_FLUSH means the device cache should be flushed before executing the request. REQ_FUA means that the data in the request should be on non-volatile media on completion. Block layer will choose the correct way of implementing the semantics and execute it. The request may be passed to the device directly if the device can handle it; otherwise, it will be sequenced using one or more proxy requests. Devices will never see REQ_FLUSH and/or FUA which it doesn't support. Also, unlike the original REQ_HARDBARRIER, REQ_FLUSH/FUA requests are never failed with -EOPNOTSUPP. If the underlying device doesn't support FLUSH/FUA, the block layer simply make those noop. IOW, it no longer distinguishes between writeback cache which doesn't support cache flush and writethrough/no cache. Devices which have WB cache w/o flush are very difficult to come by these days and there's nothing much we can do anyway, so it doesn't make sense to require everyone to implement -EOPNOTSUPP handling. This will simplify filesystems and block drivers as they can drop -EOPNOTSUPP retry logic for barriers. * QUEUE_ORDERED_* are removed and QUEUE_FSEQ_* are moved into blk-flush.c. * REQ_FLUSH w/o data can also be directly passed to drivers without sequencing but some drivers assume that zero length requests don't have rq->bio which isn't true for these requests requiring the use of proxy requests. * REQ_COMMON_MASK now includes REQ_FLUSH | REQ_FUA so that they are copied from bio to request. * WRITE_BARRIER is marked deprecated and WRITE_FLUSH, WRITE_FUA and WRITE_FLUSH_FUA are added. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-flush.c | 85 ++++++++++++++++++++++++++++++------------------------- block/blk.h | 3 ++ 3 files changed, 50 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 8870ae4..18455c4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1204,7 +1204,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (bio->bi_rw & REQ_HARDBARRIER) { + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { where = ELEVATOR_INSERT_FRONT; goto get_rq; } diff --git a/block/blk-flush.c b/block/blk-flush.c index dd87322..452c552 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,5 +1,5 @@ /* - * Functions related to barrier IO handling + * Functions to sequence FLUSH and FUA writes. */ #include #include @@ -9,6 +9,15 @@ #include "blk.h" +/* FLUSH/FUA sequences */ +enum { + QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ + QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ + QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ + QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ + QUEUE_FSEQ_DONE = (1 << 4), +}; + static struct request *queue_next_fseq(struct request_queue *q); unsigned blk_flush_cur_seq(struct request_queue *q) @@ -79,6 +88,7 @@ static void queue_flush(struct request_queue *q, struct request *rq, static struct request *queue_next_fseq(struct request_queue *q) { + struct request *orig_rq = q->orig_flush_rq; struct request *rq = &q->flush_rq; switch (blk_flush_cur_seq(q)) { @@ -87,12 +97,11 @@ static struct request *queue_next_fseq(struct request_queue *q) break; case QUEUE_FSEQ_DATA: - /* initialize proxy request and queue it */ + /* initialize proxy request, inherit FLUSH/FUA and queue it */ blk_rq_init(q, rq); - init_request_from_bio(rq, q->orig_flush_rq->bio); - rq->cmd_flags &= ~REQ_HARDBARRIER; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; + init_request_from_bio(rq, orig_rq->bio); + rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); + rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); rq->end_io = flush_data_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); @@ -110,60 +119,58 @@ static struct request *queue_next_fseq(struct request_queue *q) struct request *blk_do_flush(struct request_queue *q, struct request *rq) { + unsigned int fflags = q->flush_flags; /* may change, cache it */ + bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; + bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); + bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); unsigned skip = 0; - if (!(rq->cmd_flags & REQ_HARDBARRIER)) + /* + * Special case. If there's data but flush is not necessary, + * the request can be issued directly. + * + * Flush w/o data should be able to be issued directly too but + * currently some drivers assume that rq->bio contains + * non-zero data if it isn't NULL and empty FLUSH requests + * getting here usually have bio's without data. + */ + if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { + rq->cmd_flags &= ~REQ_FLUSH; + if (!has_fua) + rq->cmd_flags &= ~REQ_FUA; return rq; + } + /* + * Sequenced flushes can't be processed in parallel. If + * another one is already in progress, queue for later + * processing. + */ if (q->flush_seq) { - /* - * Sequenced flush is already in progress and they - * can't be processed in parallel. Queue for later - * processing. - */ list_move_tail(&rq->queuelist, &q->pending_flushes); return NULL; } - if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - return NULL; - } - /* * Start a new flush sequence */ q->flush_err = 0; - q->ordered = q->next_ordered; q->flush_seq |= QUEUE_FSEQ_STARTED; - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - - /* stash away the original request */ + /* adjust FLUSH/FUA of the original request and stash it away */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!has_fua) + rq->cmd_flags &= ~REQ_FUA; blk_dequeue_request(rq); q->orig_flush_rq = rq; - if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) + /* skip unneded sequences and return the first one */ + if (!do_preflush) skip |= QUEUE_FSEQ_PREFLUSH; - - if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) + if (!blk_rq_sectors(rq)) skip |= QUEUE_FSEQ_DATA; - - if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) + if (!do_postflush) skip |= QUEUE_FSEQ_POSTFLUSH; - - /* complete skipped sequences and return the first sequence */ return blk_flush_complete_seq(q, skip, 0); } diff --git a/block/blk.h b/block/blk.h index 24b92bd..a09c18b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -60,6 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); + if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || + rq == &q->flush_rq) + return rq; rq = blk_do_flush(q, rq); if (rq) return rq; -- cgit v1.1 From 1e87901e189c8f01750d67485009fe3827c691bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: filter flush bio's in __generic_make_request() There are a number of make_request based drivers which don't support cache flushes. Filter out flush bio's in __generic_make_request() so that they don't have to worry about them. All FLUSH/FUA requests with data are converted to regular IO requests and empty ones are completed immediately. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 18455c4..495bdc4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1509,6 +1509,19 @@ static inline void __generic_make_request(struct bio *bio) if (bio_check_eod(bio, nr_sectors)) goto end_io; + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. + */ + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + if (!nr_sectors) { + err = 0; + goto end_io; + } + } + if ((bio->bi_rw & REQ_DISCARD) && (!blk_queue_discard(q) || ((bio->bi_rw & REQ_SECURE) && -- cgit v1.1 From cde4c406d8fb051c5aafc917643adbb9dbd0abc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: simplify queue_next_fseq We need to call blk_rq_init and elv_insert for all cases in queue_next_fseq, so take these calls into common code. Also move the end_io initialization from queue_flush into queue_next_fseq and rename queue_flush to init_flush_request now that it's old name doesn't apply anymore. Signed-off-by: Christoph Hellwig Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-flush.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 452c552..72905f8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -74,16 +74,11 @@ static void post_flush_end_io(struct request *rq, int error) blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error); } -static void queue_flush(struct request_queue *q, struct request *rq, - rq_end_io_fn *end_io) +static void init_flush_request(struct request *rq, struct gendisk *disk) { - blk_rq_init(q, rq); rq->cmd_type = REQ_TYPE_FS; rq->cmd_flags = REQ_FLUSH; - rq->rq_disk = q->orig_flush_rq->rq_disk; - rq->end_io = end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + rq->rq_disk = disk; } static struct request *queue_next_fseq(struct request_queue *q) @@ -91,29 +86,28 @@ static struct request *queue_next_fseq(struct request_queue *q) struct request *orig_rq = q->orig_flush_rq; struct request *rq = &q->flush_rq; + blk_rq_init(q, rq); + switch (blk_flush_cur_seq(q)) { case QUEUE_FSEQ_PREFLUSH: - queue_flush(q, rq, pre_flush_end_io); + init_flush_request(rq, orig_rq->rq_disk); + rq->end_io = pre_flush_end_io; break; - case QUEUE_FSEQ_DATA: - /* initialize proxy request, inherit FLUSH/FUA and queue it */ - blk_rq_init(q, rq); init_request_from_bio(rq, orig_rq->bio); rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); rq->end_io = flush_data_end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); break; - case QUEUE_FSEQ_POSTFLUSH: - queue_flush(q, rq, post_flush_end_io); + init_flush_request(rq, orig_rq->rq_disk); + rq->end_io = post_flush_end_io; break; - default: BUG(); } + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); return rq; } -- cgit v1.1 From 337238be1bf52e1242f940fc6fe83fb395e55057 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: initialize flush request with WRITE_FLUSH instead of REQ_FLUSH init_flush_request() only set REQ_FLUSH when initializing flush requests making them READ requests. Use WRITE_FLUSH instead. Signed-off-by: Tejun Heo Reported-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 72905f8..f357f1f 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -77,7 +77,7 @@ static void post_flush_end_io(struct request *rq, int error) static void init_flush_request(struct request *rq, struct gendisk *disk) { rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = REQ_FLUSH; + rq->cmd_flags = WRITE_FLUSH; rq->rq_disk = disk; } -- cgit v1.1 From 47f70d5a6ca78c40a1c799d43506efbfed914f7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: kick queue after sequencing REQ_FLUSH/FUA While completing a request from a REQ_FLUSH/FUA sequence, another request can be pushed to the request queue. If a driver tests elv_queue_empty() before completing a request and runs the queue again only if the queue wasn't empty, this may lead to hang. Please note that most drivers either kick the queue unconditionally or test queue emptiness after completing the current request and don't have this problem. This patch removes this possibility by making REQ_FLUSH/FUA sequence code kick the queue if the queue was empty before completing a request from REQ_FLUSH/FUA sequence. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-flush.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index f357f1f..cb4c8440 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -56,22 +56,38 @@ static struct request *blk_flush_complete_seq(struct request_queue *q, return next_rq; } +static void blk_flush_complete_seq_end_io(struct request_queue *q, + unsigned seq, int error) +{ + bool was_empty = elv_queue_empty(q); + struct request *next_rq; + + next_rq = blk_flush_complete_seq(q, seq, error); + + /* + * Moving a request silently to empty queue_head may stall the + * queue. Kick the queue in those cases. + */ + if (was_empty && next_rq) + __blk_run_queue(q); +} + static void pre_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); } static void flush_data_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); } static void post_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); } static void init_flush_request(struct request *rq, struct gendisk *disk) -- cgit v1.1 From 09d60c701b64b509f328cac72970eb894f485b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: make sure FSEQ_DATA request has the same rq_disk as the original rq->rq_disk and bio->bi_bdev->bd_disk may differ if a request has passed through remapping drivers. FSEQ_DATA request incorrectly followed bio->bi_bdev->bd_disk ending up being issued w/ mismatching rq_disk. Make it follow orig_rq->rq_disk. Signed-off-by: Tejun Heo Reported-by: Kiyoshi Ueda Tested-by: Kiyoshi Ueda Signed-off-by: Jens Axboe --- block/blk-flush.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index cb4c8440..7d1fc98 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -111,6 +111,13 @@ static struct request *queue_next_fseq(struct request_queue *q) break; case QUEUE_FSEQ_DATA: init_request_from_bio(rq, orig_rq->bio); + /* + * orig_rq->rq_disk may be different from + * bio->bi_bdev->bd_disk if orig_rq got here through + * remapping drivers. Make sure rq->rq_disk points + * to the same one as orig_rq. + */ + rq->rq_disk = orig_rq->rq_disk; rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); rq->end_io = flush_data_end_io; -- cgit v1.1 From d391a2dda2f1c993f094bdb3a8a342c5e0546553 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: use REQ_FLUSH in blkdev_issue_flush() Update blkdev_issue_flush() to use new REQ_FLUSH interface. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 7d1fc98..62b7df9 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -191,13 +191,10 @@ struct request *blk_do_flush(struct request_queue *q, struct request *rq) return blk_flush_complete_seq(q, skip, 0); } -static void bio_end_empty_barrier(struct bio *bio, int err) +static void bio_end_flush(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -235,19 +232,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, * some block devices may not have their queue correctly set up here * (e.g. loop device without a backing file) and so issuing a flush * here will panic. Ensure there is a request function before issuing - * the barrier. + * the flush. */ if (!q->make_request_fn) return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_empty_barrier; + bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; if (test_bit(BLKDEV_WAIT, &flags)) bio->bi_private = &wait; bio_get(bio); - submit_bio(WRITE_BARRIER, bio); + submit_bio(WRITE_FLUSH, bio); if (test_bit(BLKDEV_WAIT, &flags)) { wait_for_completion(&wait); /* @@ -259,9 +256,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, *error_sector = bio->bi_sector; } - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) + if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; bio_put(bio); -- cgit v1.1 From 3a2edd0d6ddbd5fa3b389ea6db811285415ce6c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:18 +0200 Subject: block: make __blk_rq_prep_clone() copy most command flags Currently __blk_rq_prep_clone() copies only REQ_WRITE and REQ_DISCARD. There's no reason to omit other command flags and REQ_FUA needs to be copied to implement FUA support in request-based dm. REQ_COMMON_MASK which specifies flags to be copied from bio to request already identifies all the command flags. Define REQ_CLONE_MASK to be the same as REQ_COMMON_MASK for clarity and make __blk_rq_prep_clone() copy all flags in the mask. Signed-off-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 495bdc4..2a5b192 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2505,9 +2505,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); - if (src->cmd_flags & REQ_DISCARD) - dst->cmd_flags |= REQ_DISCARD; + dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); -- cgit v1.1 From 8c5553678237b7121355108e03c36086037d8975 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2010 05:29:22 -0400 Subject: block: remove the BLKDEV_IFL_BARRIER flag Remove support for barriers on discards, which is unused now. Also remove the DISCARD_NOBARRIER I/O type in favour of just setting the rw flags up locally in blkdev_issue_discard. tj: Also remove DISCARD_SECURE and use REQ_SECURE directly. Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-lib.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index c392029..fe2e6ed 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); - int type = flags & BLKDEV_IFL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; + int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; struct bio *bio; int ret = 0; @@ -65,7 +64,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (flags & BLKDEV_IFL_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; - type |= DISCARD_SECURE; + type |= REQ_SECURE; } while (nr_sects && !ret) { @@ -162,12 +161,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bb.wait = &wait; bb.end_io = NULL; - if (flags & BLKDEV_IFL_BARRIER) { - /* issue async barrier before the data */ - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); - if (ret) - return ret; - } submit: ret = 0; while (nr_sects != 0) { @@ -199,13 +192,6 @@ submit: issued++; submit_bio(WRITE, bio); } - /* - * When all data bios are in flight. Send final barrier if requeted. - */ - if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, - flags & BLKDEV_IFL_WAIT); - if (flags & BLKDEV_IFL_WAIT) /* Wait for bios in-flight */ -- cgit v1.1 From c8bf1336824ebd698d37b71763e1c43190f2229a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 10 Sep 2010 20:07:38 +0200 Subject: Consolidate min_not_zero We have several users of min_not_zero, each of them using their own definition. Move the define to kernel.h. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4b..8d592b5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -455,11 +455,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /** * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers * @t: the stacking driver (top) -- cgit v1.1 From 13f05c8d8e98bbdce89158bfdb2e380940695a88 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 10 Sep 2010 20:50:10 +0200 Subject: block/scsi: Provide a limit on the number of integrity segments Some controllers have a hardware limit on the number of protection information scatter-gather list segments they can handle. Introduce a max_integrity_segments limit in the block layer and provide a new scsi_host_template setting that allows HBA drivers to provide a value suitable for the hardware. Add support for honoring the integrity segment limit when merging both bios and requests. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 93 +++++++++++++++++++++++++++++++++++++++------------ block/blk-merge.c | 23 ++++++++----- block/blk-settings.c | 3 ++ block/blk-sysfs.c | 11 ++++++ block/blk.h | 8 ----- 5 files changed, 100 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef..885cbb5 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep; /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a - * scatterlist corresponding to the integrity metadata in a request. + * scatterlist corresponding to the integrity metadata in a bio. */ -int blk_rq_count_integrity_sg(struct request *rq) +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + unsigned int segments = 0; + unsigned int seg_size = 0; + unsigned int i = 0; - ivprv = NULL; - segments = 0; + bio_for_each_integrity_vec(iv, bio, i) { - rq_for_each_integrity_segment(iv, rq, iter) { + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (seg_size + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; - if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + seg_size += iv->bv_len; + } else { +new_segment: segments++; + seg_size = iv->bv_len; + } ivprv = iv; } @@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg(). */ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - struct scatterlist *sg; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + struct scatterlist *sg = NULL; + unsigned int segments = 0; + unsigned int i = 0; - ivprv = NULL; - sg = NULL; - segments = 0; - - rq_for_each_integrity_segment(iv, rq, iter) { + bio_for_each_integrity_vec(iv, bio, i) { if (ivprv) { if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (sg->length + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + sg->length += iv->bv_len; } else { new_segment: @@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); +int blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) +{ + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return -1; + + if (req->nr_integrity_segments + next->nr_integrity_segments > + q->limits.max_integrity_segments) + return -1; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_rq); + +int blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) +{ + int nr_integrity_segs; + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); + bio->bi_next = next; + + if (req->nr_integrity_segments + nr_integrity_segs > + q->limits.max_integrity_segments) + return -1; + + req->nr_integrity_segments += nr_integrity_segs; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_bio); + struct integrity_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_integrity *, char *); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3b0cd42..6a72546 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) + goto no_merge; + + if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + goto no_merge; /* * This will form the start of a new hw segment. Bump both @@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, */ req->nr_phys_segments += nr_phys_segs; return 1; + +no_merge: + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; + return 0; } int ll_back_merge_fn(struct request_queue *q, struct request *req, @@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; + if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -372,9 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn diff --git a/block/blk-settings.c b/block/blk-settings.c index 8d592b5..f8f2ddf 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -509,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->seg_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, + b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 001ab18..b014f77 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_integrity_segments, (page)); +} + static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) @@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_integrity_segments_entry = { + .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, + .show = queue_max_integrity_segments_show, +}; + static struct queue_sysfs_entry queue_max_segment_size_entry = { .attr = {.name = "max_segment_size", .mode = S_IRUGO }, .show = queue_max_segment_size_show, @@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = { &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, diff --git a/block/blk.h b/block/blk.h index 6e7dc87..6738831 100644 --- a/block/blk.h +++ b/block/blk.h @@ -132,14 +132,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) - -#endif /* BLK_DEV_INTEGRITY */ - static inline int blk_cpu_to_group(int cpu) { #ifdef CONFIG_SCHED_MC -- cgit v1.1 From 8dcbdc742fec9e6c542ff9cb599d2ee620753d07 Mon Sep 17 00:00:00 2001 From: San Mehat Date: Tue, 14 Sep 2010 08:48:01 +0200 Subject: block: block_dump: Add number of sectors to debug output Signed-off-by: San Mehat Signed-off-by: Linus Walleij Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7..8d07c1b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1612,11 +1612,12 @@ void submit_bio(int rw, struct bio *bio) if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev, b)); + bdevname(bio->bi_bdev, b), + count); } } -- cgit v1.1 From 144177991ca624841ddbd1e7edff958fc0f6d1fe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 13:08:27 +0200 Subject: block: fix an address space warning in blk-map.c Change type of 2nd parameter of blk_rq_aligned() into unsigned long and remove unnecessary casting. Now we can call it with 'uaddr' instead of 'ubuf' in __blk_rq_map_user() so that it can remove following warnings from sparse: block/blk-map.c:57:31: warning: incorrect type in argument 2 (different address spaces) block/blk-map.c:57:31: expected void *addr block/blk-map.c:57:31: got void [noderef] *ubuf However blk_rq_map_kern() needs one more local variable to handle it. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index c65d759..ac0f7d4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, * direct dma. else, set up kernel bounce buffers */ uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, ubuf, len) && !map_data) + if (blk_rq_aligned(q, uaddr, len) && !map_data) bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); else bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); @@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; + unsigned long addr = (unsigned long) kbuf; int do_copy = 0; struct bio *bio; int ret; @@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); + do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else -- cgit v1.1 From 6d1d8050b4bc89d0165d29b58e894aeba2564a97 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:05 -0500 Subject: block, partition: add partition_meta_info to hd_struct I'm reposting this patch series as v4 since there have been no additional comments, and I cleaned up one extra bit of unneeded code (in 3/3). The patches are against Linus's tree: 2bfc96a127bc1cc94d26bfaa40159966064f9c8c (2.6.36-rc3). Would this patchset be suitable for inclusion in an mm branch? This changes adds a partition_meta_info struct which itself contains a union of structures that provide partition table specific metadata. This change leaves the union empty. The subsequent patch includes an implementation for CONFIG_EFI_PARTITION-based metadata. Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- block/genhd.c | 1 + block/ioctl.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 59a2db6..c8da120 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1004,6 +1004,7 @@ static void disk_release(struct device *dev) kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + free_part_info(&disk->part0); kfree(disk); } struct class block_class = { diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0..2c15fe0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user /* all seems OK */ part = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE); + ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); return IS_ERR(part) ? PTR_ERR(part) : 0; case BLKPG_DEL_PARTITION: -- cgit v1.1 From b5af921ec02333e943efb59aca4f56b78fc0e100 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:07 -0500 Subject: init: add support for root devices specified by partition UUID This is the third patch in a series which adds support for storing partition metadata, optionally, off of the hd_struct. One major use for that data is being able to resolve partition by other identities than just the index on a block device. Device enumeration varies by platform and there's a benefit to being able to use something like EFI GPT's GUIDs to determine the correct block device and partition to mount as the root. This change adds that support to root= by adding support for the following syntax: root=PARTUUID=hex-uuid Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index c8da120..5c9c503 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -642,6 +642,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; /* * Don't show empty devices or things that have been @@ -660,10 +661,14 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - printk("%s%s %10llu %s", is_part0 ? "" : " ", + uuid[0] = 0; + if (part->info) + part_unpack_uuid(part->info->uuid, uuid); + + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); + disk_name(disk, part->partno, name_buf), uuid); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) -- cgit v1.1 From af41d7bd9b685ab4e8f930627874ba4f4728e128 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:32 -0400 Subject: blk-cgroup: Kill the header printed at the start of blkio.weight_device file o Kill extra "dev weight" header which is printed when somebody reads blkio.weight_device file. This really seems to be out of convention. No other blkio files are printing any header at the start of file. I think it is ok to just print values and how to interpret values should be part of documentation. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a680964..c1a39d9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -811,8 +811,6 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct blkio_cgroup *blkcg; struct blkio_policy_node *pn; - seq_printf(m, "dev\tweight\n"); - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); -- cgit v1.1 From 062a644d6121d5e2f51c0b2ca0cbc5155ebf845b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:33 -0400 Subject: blk-cgroup: Prepare the base for supporting more than one IO control policies o This patch prepares the base for introducing new IO control policies. Currently all the code is written knowing there is only one policy and that is proportional bandwidth. Creating infrastructure for newer policies to come in. o Also there were many functions which were generated using macro. It was very confusing. Got rid of those. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 544 +++++++++++++++++++++++++++++++++++++--------------- block/blk-cgroup.h | 36 +++- block/cfq-iosched.c | 1 + block/cfq.h | 2 +- 4 files changed, 429 insertions(+), 154 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c1a39d9..7762987 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) + struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, @@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, list_add(&pn->node, &blkcg->policy_list); } +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; +} + +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); +} + /* Must be called with blkcg->lock held */ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { @@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) /* Must be called with blkcg->lock held */ static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) { struct blkio_policy_node *pn; list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev) + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) return pn; } @@ -86,6 +114,20 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -341,7 +383,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) { unsigned long flags; @@ -350,6 +393,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; spin_unlock_irqrestore(&blkcg->lock, flags); /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); @@ -408,51 +452,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -#define SHOW_FUNCTION(__VAR) \ -static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct blkio_cgroup *blkcg; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - return (u64)blkcg->__VAR; \ -} - -SHOW_FUNCTION(weight); -#undef SHOW_FUNCTION - -static int -blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) -{ - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn; - - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) - return -EINVAL; - - blkcg = cgroup_to_blkio_cgroup(cgroup); - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev); - - if (pn) - continue; - - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} - static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -593,52 +592,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ -static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct cgroup_map_cb *cb) \ -{ \ - struct blkio_cgroup *blkcg; \ - struct blkio_group *blkg; \ - struct hlist_node *n; \ - uint64_t cgroup_total = 0; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - rcu_read_lock(); \ - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) { \ - spin_lock_irq(&blkg->stats_lock); \ - cgroup_total += blkio_get_stat(blkg, cb, \ - blkg->dev, type); \ - spin_unlock_irq(&blkg->stats_lock); \ - } \ - } \ - if (show_total) \ - cb->fill(cb, "Total", cgroup_total); \ - rcu_read_unlock(); \ - cgroup_unlock(); \ - return 0; \ -} - -SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); -SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); -SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); -SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); -SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); -#ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); -SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); -SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); -SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); -#endif -#undef SHOW_FUNCTION_PER_GROUP - static int blkio_check_dev_num(dev_t dev) { int part = 0; @@ -652,7 +605,7 @@ static int blkio_check_dev_num(dev_t dev) } static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn) + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; @@ -705,12 +658,20 @@ static int blkio_policy_parse_and_set(char *buf, if (s[1] == NULL) return -EINVAL; - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + switch (plid) { + case BLKIO_POLICY_PROP: + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; - newpn->weight = temp; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->weight = temp; + break; + default: + BUG(); + } return 0; } @@ -720,7 +681,8 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, { struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); if (pn) return pn->weight; else @@ -728,18 +690,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, } EXPORT_SYMBOL_GPL(blkcg_get_weight); +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->weight == 0) + return 1; + break; + default: + BUG(); + } + + return 0; +} + +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) +{ + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->weight = newpn->weight; + break; + default: + BUG(); + } +} + +/* + * Some rules/values in blkg have changed. Propogate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->weight ? pn->weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + default: + BUG(); + } +} -static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +/* + * A policy node rule has been updated. Propogate this update to all the + * block groups which might be affected by this update. + */ +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); +} + +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) { int ret = 0; char *buf; struct blkio_policy_node *newpn, *pn; struct blkio_cgroup *blkcg; - struct blkio_group *blkg; int keep_newpn = 0; - struct hlist_node *n; - struct blkio_policy_type *blkiop; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); buf = kstrdup(buffer, GFP_KERNEL); if (!buf) @@ -751,7 +781,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto free_buf; } - ret = blkio_policy_parse_and_set(buf, newpn); + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); if (ret) goto free_newpn; @@ -759,9 +789,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, spin_lock_irq(&blkcg->lock); - pn = blkio_policy_search_node(blkcg, newpn->dev); + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); if (!pn) { - if (newpn->weight != 0) { + if (!blkio_delete_rule_command(newpn)) { blkio_policy_insert_node(blkcg, newpn); keep_newpn = 1; } @@ -769,33 +799,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto update_io_group; } - if (newpn->weight == 0) { - /* weight == 0 means deleteing a specific weight */ + if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } spin_unlock_irq(&blkcg->lock); - pn->weight = newpn->weight; + blkio_update_policy_rule(pn, newpn); update_io_group: - /* update weight for each cfqg */ - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (newpn->dev == blkg->dev) { - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - newpn->weight ? - newpn->weight : - blkcg->weight); - } - } - - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + blkio_update_policy_node_blkg(blkcg, newpn); free_newpn: if (!keep_newpn) @@ -805,21 +819,219 @@ free_buf: return ret; } -static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + break; + default: + BUG(); + } +} + +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) { - struct blkio_cgroup *blkcg; struct blkio_policy_node *pn; - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); list_for_each_entry(pn, &blkcg->policy_list, node) { - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); } spin_unlock_irq(&blkcg->lock); } +} + +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, + bool show_total) +{ + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, + type); + spin_unlock_irq(&blkg->stats_lock); + } + } + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; +} + +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SECTORS, 0); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1); + case BLKIO_PROP_io_merged: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_MERGED, 1); + case BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0); +#endif + default: + BUG(); + } + break; + + default: + BUG(); + } + + return 0; +} + +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; + } + break; + default: + BUG(); + } + return 0; +} + +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); + } + break; + default: + BUG(); + } return 0; } @@ -827,46 +1039,66 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct cftype blkio_files[] = { { .name = "weight_device", - .read_seq_string = blkiocg_weight_device_read, - .write_string = blkiocg_weight_device_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, .max_write_len = 256, }, { .name = "weight", - .read_u64 = blkiocg_weight_read, - .write_u64 = blkiocg_weight_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, }, { .name = "time", - .read_map = blkiocg_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, }, { .name = "sectors", - .read_map = blkiocg_sectors_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_bytes", - .read_map = blkiocg_io_service_bytes_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, }, { .name = "io_serviced", - .read_map = blkiocg_io_serviced_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_time", - .read_map = blkiocg_io_service_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_wait_time", - .read_map = blkiocg_io_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_merged", - .read_map = blkiocg_io_merged_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, }, { .name = "io_queued", - .read_map = blkiocg_io_queued_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, }, { .name = "reset_stats", @@ -875,23 +1107,33 @@ struct cftype blkio_files[] = { #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_map = blkiocg_avg_queue_size_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, }, { .name = "group_wait_time", - .read_map = blkiocg_group_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "idle_time", - .read_map = blkiocg_idle_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, }, { .name = "empty_time", - .read_map = blkiocg_empty_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, }, { .name = "dequeue", - .read_map = blkiocg_dequeue_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, }, #endif }; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec..c8de259 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,10 @@ #include +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ +}; + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP @@ -65,6 +69,25 @@ enum blkg_state_flags { BLKG_empty, }; +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -112,6 +135,8 @@ struct blkio_group { char path[128]; /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; + /* policy which owns this blk group */ + enum blkio_policy_id plid; /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; @@ -122,6 +147,10 @@ struct blkio_policy_node { struct list_head node; dev_t dev; unsigned int weight; + /* This node belongs to max bw policy or porportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs to */ + int fileid; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, @@ -139,6 +168,7 @@ struct blkio_policy_ops { struct blkio_policy_type { struct list_head list; struct blkio_policy_ops ops; + enum blkio_policy_id plid; }; /* Blkio controller policy registration */ @@ -212,7 +242,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev); + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -234,7 +265,8 @@ static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index eb4086f..b9f8619 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4013,6 +4013,7 @@ static struct blkio_policy_type blkio_policy_cfq = { .blkio_unlink_group_fn = cfq_unlink_blkio_group, .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, }, + .plid = BLKIO_POLICY_PROP, }; #else static struct blkio_policy_type blkio_policy_cfq; diff --git a/block/cfq.h b/block/cfq.h index 93448e5..54a6d90 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev); + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); } static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -- cgit v1.1 From 4c9eefa16c6f124ffcc736cb719b24ea27f85017 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:34 -0400 Subject: blk-cgroup: Introduce cgroup changes for throttling policy o cgroup chagnes for throttle policy. o Introduces READ and WRITE bytes per second throttling rules. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++--- block/blk-cgroup.h | 30 ++++++++++- 2 files changed, 167 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7762987..aae8c93 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -128,6 +128,27 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) } } +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -612,6 +633,7 @@ static int blkio_policy_parse_and_set(char *buf, unsigned long major, minor, temp; int i = 0; dev_t dev; + u64 bps; memset(s, 0, sizeof(s)); @@ -667,7 +689,16 @@ static int blkio_policy_parse_and_set(char *buf, newpn->plid = plid; newpn->fileid = fileid; - newpn->weight = temp; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; break; default: BUG(); @@ -684,18 +715,45 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); if (pn) - return pn->weight; + return pn->val.weight; else return blkcg->weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + /* Checks whether user asked for deleting a policy rule */ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) { switch(pn->plid) { case BLKIO_POLICY_PROP: - if (pn->weight == 0) + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + if (pn->val.bps == 0) return 1; break; default: @@ -710,7 +768,10 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, { switch(oldpn->plid) { case BLKIO_POLICY_PROP: - oldpn->weight = newpn->weight; + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + oldpn->val.bps = newpn->val.bps; break; default: BUG(); @@ -725,13 +786,23 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, struct blkio_group *blkg, struct blkio_policy_node *pn) { unsigned int weight; + u64 bps; switch(pn->plid) { case BLKIO_POLICY_PROP: - weight = pn->weight ? pn->weight : + weight = pn->val.weight ? pn->val.weight : blkcg->weight; blkio_update_group_weight(blkg, weight); break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + } + break; default: BUG(); } @@ -826,7 +897,17 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) case BLKIO_POLICY_PROP: if (pn->fileid == BLKIO_PROP_weight_device) seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + if (pn->fileid == BLKIO_THROTL_read_bps_device) + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + else if (pn->fileid == BLKIO_THROTL_write_bps_device) + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + else + BUG(); break; default: BUG(); @@ -869,6 +950,16 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, BUG(); } break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; default: BUG(); } @@ -959,7 +1050,18 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, BUG(); } break; - + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + default: + BUG(); + } + break; default: BUG(); } @@ -1053,6 +1155,23 @@ struct cftype blkio_files[] = { .write_u64 = blkiocg_file_write_u64, }, { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_time), @@ -1071,12 +1190,24 @@ struct cftype blkio_files[] = { .read_map = blkiocg_file_read_map, }, { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { .name = "io_serviced", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_serviced), .read_map = blkiocg_file_read_map, }, { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, + { .name = "io_service_time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_service_time), diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index c8de259..1b73882 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,6 +17,7 @@ enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ }; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) @@ -88,6 +89,14 @@ enum blkcg_file_name_prop { BLKIO_PROP_dequeue, }; +/* cgroup files owned by throttle policy */ +enum blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -146,23 +155,42 @@ struct blkio_group { struct blkio_policy_node { struct list_head node; dev_t dev; - unsigned int weight; /* This node belongs to max bw policy or porportional weight policy */ enum blkio_policy_id plid; /* cgroup file to which this rule belongs to */ int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of byptes per second + * Whether this rate represents read or write is determined + * by file type "fileid". + */ + u64 bps; + } val; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, + u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, + u64 write_bps); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; }; struct blkio_policy_type { -- cgit v1.1 From e43473b7f223ec866f7db273697e76c337c390f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:35 -0400 Subject: blkio: Core implementation of throttle policy o Actual implementation of throttling policy in block layer. Currently it implements READ and WRITE bytes per second throttling logic. IOPS throttling comes in later patches. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 24 ++ block/blk-throttle.c | 909 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 946 insertions(+) create mode 100644 block/blk-throttle.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 9be0b56..6c9213e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y && EXPERIMENTAL + default n + ---help--- + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/cgroups/blkio-controller.txt for more information. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 0bb499a..c850d5e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-core.c b/block/blk-core.c index 8d07c1b..797d509 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -382,6 +382,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); + throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -459,6 +460,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -515,6 +518,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + if (blk_throtl_init(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); @@ -1522,6 +1530,15 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } + blk_throtl_bio(q, &bio); + + /* + * If bio = NULL, bio has been throttled and will be submitted + * later. + */ + if (!bio) + break; + trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); @@ -2580,6 +2597,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 0000000..4b49201 --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,909 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal + */ + +#include +#include +#include +#include +#include +#include "blk-cgroup.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups (excluding root group) + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + kfree(tg); +} + +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, + struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct throtl_grp *tg = NULL; + void *key = td; + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix + * tree of blkg (instead of traversing through hash list all + * the time. + */ + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* Fill in device details for root group */ + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + goto done; + } + + if (tg) + goto done; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + goto done; + + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +done: + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct cgroup *cgroup; + struct throtl_grp *tg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + tg = throtl_find_alloc_tg(td, cgroup); + if (!tg) + tg = &td->root_tg; + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); +} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td->queue, 0); + else + throtl_schedule_delayed_work(td->queue, + (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, bytes_trim, time_elapsed; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + + bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + + if (wait) + *wait = jiffy_wait; + + if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) + throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + + /* + * TODO: This will take blkg->stats_lock. Figure out a way + * to avoid this cost. + */ + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* + * If group is already on active tree, then update dispatch time + * only if it is lesser than existing dispatch time. Otherwise + * always update the dispatch time + */ + + if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) + return; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= REQ_THROTTLED; + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +{ + + struct throtl_data *td = q->td; + struct delayed_work *dwork = &td->throtl_work; + + if (total_nr_queued(td) > 0) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. + */ + __cancel_delayed_work(dwork); + kblockd_schedule_delayed_work(q, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} +EXPORT_SYMBOL(throtl_schedule_delayed_work); + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +static void throtl_td_free(struct throtl_data *td) +{ + kfree(td); +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, + u64 read_bps) +{ + tg_of_blkg(blkg)->bps[READ] = read_bps; +} + +static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, + u64 write_bps) +{ + tg_of_blkg(blkg)->bps[WRITE] = write_bps; +} + +void throtl_shutdown_timer_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + }, +}; + +int blk_throtl_bio(struct request_queue *q, struct bio **biop) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + struct bio *bio = *biop; + bool rw = bio_data_dir(bio), update_disptime = true; + + if (bio->bi_rw & REQ_THROTTLED) { + bio->bi_rw &= ~REQ_THROTTLED; + return 0; + } + + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + */ + update_disptime = false; + goto queue_bio; + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + goto out; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" + " queued=%d/%d", rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + *biop = NULL; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out: + spin_unlock_irq(q->queue_lock); + return 0; +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + + /* Init root group */ + tg = &td->root_tg; + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + atomic_set(&tg->ref, 1); + + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + rcu_read_lock(); + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, + 0, BLKIO_POLICY_THROTL); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + td->queue = q; + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_timer_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + blkiocg_del_blkio_group(&td->root_tg.blkg); + + /* If there are other groups */ + if (td->nr_undestroyed_grps >= 1) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + throtl_td_free(td); +} + +static int __init throtl_init(void) +{ + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); -- cgit v1.1 From 7702e8f45b0a3bb262b9366c60beb5445758d94c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:36 -0400 Subject: blk-cgroup: cgroup changes for IOPS limit support o cgroup changes for IOPS throttling rules. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++------- block/blk-cgroup.h | 13 +++++ 2 files changed, 135 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index aae8c93..20ce6f5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -149,6 +149,27 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, } } +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + + if (fileid == BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -630,7 +651,7 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp; + unsigned long major, minor, temp, iops; int i = 0; dev_t dev; u64 bps; @@ -692,13 +713,28 @@ static int blkio_policy_parse_and_set(char *buf, newpn->val.weight = temp; break; case BLKIO_POLICY_THROTL: - ret = strict_strtoull(s[1], 10, &bps); - if (ret) - return -EINVAL; + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.bps = bps; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + ret = strict_strtoul(s[1], 10, &iops); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = iops; + break; + } break; default: BUG(); @@ -744,6 +780,29 @@ uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) return -1; } +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + /* Checks whether user asked for deleting a policy rule */ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) { @@ -753,8 +812,17 @@ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) return 1; break; case BLKIO_POLICY_THROTL: - if (pn->val.bps == 0) - return 1; + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; + } break; default: BUG(); @@ -771,7 +839,15 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, oldpn->val.weight = newpn->val.weight; break; case BLKIO_POLICY_THROTL: - oldpn->val.bps = newpn->val.bps; + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } break; default: BUG(); @@ -785,7 +861,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, struct blkio_group *blkg, struct blkio_policy_node *pn) { - unsigned int weight; + unsigned int weight, iops; u64 bps; switch(pn->plid) { @@ -801,6 +877,11 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, bps = pn->val.bps ? pn->val.bps : (-1); blkio_update_group_bps(blkg, bps, pn->fileid); break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; } break; default: @@ -900,14 +981,18 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) MINOR(pn->dev), pn->val.weight); break; case BLKIO_POLICY_THROTL: - if (pn->fileid == BLKIO_THROTL_read_bps_device) + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), MINOR(pn->dev), pn->val.bps); - else if (pn->fileid == BLKIO_THROTL_write_bps_device) - seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.bps); - else - BUG(); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } break; default: BUG(); @@ -954,6 +1039,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, switch(name){ case BLKIO_THROTL_read_bps_device: case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: blkio_read_policy_node_files(cft, blkcg, m); return 0; default: @@ -1171,6 +1258,24 @@ struct cftype blkio_files[] = { .write_string = blkiocg_file_write, .max_write_len = 256, }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1b73882..2070053 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -93,6 +93,8 @@ enum blkcg_file_name_prop { enum blkcg_file_name_throtl { BLKIO_THROTL_read_bps_device, BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, BLKIO_THROTL_io_service_bytes, BLKIO_THROTL_io_serviced, }; @@ -168,6 +170,7 @@ struct blkio_policy_node { * by file type "fileid". */ u64 bps; + unsigned int iops; } val; }; @@ -177,6 +180,10 @@ extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev); extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, @@ -185,12 +192,18 @@ typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, u64 read_bps); typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, + unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, + unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; }; struct blkio_policy_type { -- cgit v1.1 From 8e89d13f4ede2467629a971618537430fafaaea3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:37 -0400 Subject: blkio: Implementation of IOPS limit logic o core logic of implementing IOPS throttling. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 164 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4b49201..af53f37c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -59,8 +59,13 @@ struct throtl_grp { /* bytes per second rate limits */ uint64_t bps[2]; + /* IOPS limits */ + unsigned int iops[2]; + /* Number of bytes disptached in current slice */ uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; /* When did we start a new slice */ unsigned long slice_start[2]; @@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); hlist_add_head(&tg->tg_node, &td->tg_list); td->nr_undestroyed_grps++; @@ -335,6 +342,7 @@ static inline void throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + throtl_slice; throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed; + unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) return; bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; - if (!bytes_trim) + if (!bytes_trim && !io_trim) return; if (tg->bytes_disp[rw] >= bytes_trim) @@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) else tg->bytes_disp[rw] = 0; + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" " start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } -/* - * Returns whether one can dispatch a bio or not. Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched - */ -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - /* - * Currently whole state machine of group depends on first bio - * queued in the group bio list. So one should not be calling - * this function with a different bio if there are other bios - * queued. - */ - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; - /* If tg->bps = -1, then BW is unlimited */ - if (tg->bps[rw] == -1) { + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; return 1; } - /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. - */ - if (throtl_slice_used(td, tg, rw)) - throtl_start_new_slice(td, tg, rw); - else { - if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); - } + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) *wait = jiffy_wait; + return 0; +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); - if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) - throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); return 0; } @@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; /* * TODO: This will take blkg->stats_lock. Figure out a way * to avoid this cost. */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); - } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, tg_of_blkg(blkg)->bps[WRITE] = write_bps; } +static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, + unsigned int read_iops) +{ + tg_of_blkg(blkg)->iops[READ] = read_iops; +} + +static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, + unsigned int write_iops) +{ + tg_of_blkg(blkg)->iops[WRITE] = write_iops; +} + void throtl_shutdown_timer_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = { throtl_update_blkio_group_read_bps, .blkio_update_group_write_bps_fn = throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, }, + .plid = BLKIO_POLICY_THROTL, }; int blk_throtl_bio(struct request_queue *q, struct bio **biop) @@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" - " queued=%d/%d", rw == READ ? 'R' : 'W', + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); throtl_add_bio_tg(q->td, tg, bio); @@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; atomic_set(&tg->ref, 1); INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); -- cgit v1.1 From 01ea50638bc04ca5259f5711fcdedefcdde1cf43 Mon Sep 17 00:00:00 2001 From: "Signed-off-by: Jan Kara" Date: Thu, 16 Sep 2010 20:36:36 +0200 Subject: block: Fix race during disk initialization When a new disk is being discovered, add_disk() first ties the bdev to gendisk (via register_disk()->blkdev_get()) and only after that calls bdi_register_bdev(). Because register_disk() also creates disk's kobject, it can happen that userspace manages to open and modify the device's data (or inode) before its BDI is properly initialized leading to a warning in __mark_inode_dirty(). Fix the problem by registering BDI early enough. This patch addresses https://bugzilla.kernel.org/show_bug.cgi?id=16312 Cc: stable@kernel.org Reported-by: Larry Finger Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5c9c503..7923e72 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); -- cgit v1.1 From dd3932eddf428571762596e17b65f5dc92ca361b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2010 20:51:46 +0200 Subject: block: remove BLKDEV_IFL_WAIT All the blkdev_issue_* helpers can only sanely be used for synchronous caller. To issue cache flushes or barriers asynchronously the caller needs to set up a bio by itself with a completion callback to move the asynchronous state machine ahead. So drop the BLKDEV_IFL_WAIT flag that is always specified when calling blkdev_issue_* and also remove the now unused flags argument to blkdev_issue_flush and blkdev_issue_zeroout. For blkdev_issue_discard we need to keep it for the secure discard flag, which gains a more descriptive name and loses the bitops vs flag confusion. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 25 +++++++++++-------------- block/blk-lib.c | 21 ++++++++------------- block/ioctl.c | 4 ++-- 3 files changed, 21 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 62b7df9..54b123d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -205,7 +205,6 @@ static void bio_end_flush(struct bio *bio, int err) * @bdev: blockdev to issue flush for * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector - * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. Caller can supply @@ -214,7 +213,7 @@ static void bio_end_flush(struct bio *bio, int err) * request was pushed in some internal queue for later handling. */ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector, unsigned long flags) + sector_t *error_sector) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; @@ -240,21 +239,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; - if (test_bit(BLKDEV_WAIT, &flags)) - bio->bi_private = &wait; + bio->bi_private = &wait; bio_get(bio); submit_bio(WRITE_FLUSH, bio); - if (test_bit(BLKDEV_WAIT, &flags)) { - wait_for_completion(&wait); - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; - } + wait_for_completion(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; diff --git a/block/blk-lib.c b/block/blk-lib.c index fe2e6ed..1a320d2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -61,7 +61,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, max_discard_sectors &= ~(disc_sects - 1); } - if (flags & BLKDEV_IFL_SECURE) { + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; type |= REQ_SECURE; @@ -77,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &wait; + bio->bi_private = &wait; if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; @@ -92,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_get(bio); submit_bio(type, bio); - if (flags & BLKDEV_IFL_WAIT) - wait_for_completion(&wait); + wait_for_completion(&wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -139,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Generate and issue number of bios with zerofiled pages. @@ -148,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err) */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; @@ -174,8 +171,7 @@ submit: bio->bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = bio_batch_end_io; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &bb; + bio->bi_private = &bb; while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); @@ -193,10 +189,9 @@ submit: submit_bio(WRITE, bio); } - if (flags & BLKDEV_IFL_WAIT) - /* Wait for bios in-flight */ - while ( issued != atomic_read(&bb.done)) - wait_for_completion(&wait); + /* Wait for bios in-flight */ + while (issued != atomic_read(&bb.done)) + wait_for_completion(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) /* One of bios in the batch was completed with error.*/ diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0..cb2b909 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev) static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) { - unsigned long flags = BLKDEV_IFL_WAIT; + unsigned long flags = 0; if (start & 511) return -EINVAL; @@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; if (secure) - flags |= BLKDEV_IFL_SECURE; + flags |= BLKDEV_DISCARD_SECURE; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } -- cgit v1.1 From 749ef9f8423054e326f3a246327ed2db4b6d395f Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 20 Sep 2010 15:24:50 +0200 Subject: cfq: improve fsync performance for small files Fsync performance for small files achieved by cfq on high-end disks is lower than what deadline can achieve, due to idling introduced between the sync write happening in process context and the journal commit. Moreover, when competing with a sequential reader, a process writing small files and fsync-ing them is starved. This patch fixes the two problems by: - marking journal commits as WRITE_SYNC, so that they get the REQ_NOIDLE flag set, - force all queues that have REQ_NOIDLE requests to be put in the noidle tree. Having the queue associated to the fsync-ing process and the one associated to journal commits in the noidle tree allows: - switching between them without idling, - fairness vs. competing idling queues, since they will be serviced only after the noidle tree expires its slice. Acked-by: Vivek Goyal Reviewed-by: Jeff Moyer Tested-by: Jeff Moyer Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9f8619..6845926 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -216,7 +216,6 @@ struct cfq_data { enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; - bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -2126,7 +2125,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; - cfqd->noidle_tree_requires_idle = false; } static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) @@ -3108,7 +3106,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { @@ -3421,17 +3421,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= - !(rq->cmd_flags & REQ_NOIDLE); - /* - * Idling is enabled for SYNC_WORKLOAD. - * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !REQ_NOIDLE request - */ - if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle - || cfqq->cfqg->nr_cfqq == 1) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd); } } -- cgit v1.1 From 4b1977698ceb4c4caa800d475127139da49966f9 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 24 Sep 2010 09:51:13 -0400 Subject: block: Prevent hang_check firing during long I/O During long I/O operations, the hang_check timer may fire, trigger stack dumps that unnecessarily alarm the user. Eg. hdparm --security-erase NULL /dev/sdb ## can take *hours* to complete So, if hang_check is armed, we should wake up periodically to prevent it from triggering. This patch uses a wake-up interval equal to half the hang_check timer period, which keeps overhead low enough. Signed-off-by: Mark Lord Signed-off-by: Jens Axboe --- block/blk-exec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f1..cf1456a 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, DECLARE_COMPLETION_ONSTACK(wait); char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; + unsigned long hang_check; /* * we need an extra reference to the request, so we can look at @@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion(&wait); if (rq->errors) err = -EIO; -- cgit v1.1 From c49825facfd4969585224a896a5e717f88450cad Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Fri, 24 Sep 2010 20:25:49 +0200 Subject: block: set the bounce_pfn to the actual DMA limit rather than to max memory The bounce_pfn of the request queue in 64 bit systems is set to the current max_low_pfn. Adding more memory later makes this incorrect. Memory allocated beyond this boot time max_low_pfn appear to require bounce buffers (bounce buffers are actually not allocated but used in calculating segments that may result in "over max segments limit" errors). Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f8f2ddf..f47af50 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,16 +214,14 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; - q->limits.bounce_pfn = b_pfn; #endif + q->limits.bounce_pfn = b_pfn; if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); -- cgit v1.1 From 260a67a9e534f0c7d49ddd6451833d54ba39ac81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Oct 2010 14:42:43 +0200 Subject: block: revert bad fix for memory hotplug causing bounces Revert "block: set the bounce_pfn to the actual DMA limit rather than to max memory" This reverts commit c49825facfd4969585224a896a5e717f88450cad. Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f47af50..f8f2ddf 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,14 +214,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; + q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; -#endif q->limits.bounce_pfn = b_pfn; +#endif if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; + q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); -- cgit v1.1 From efb012b361cf9319cd86ff169afa7550b7aa9336 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Fri, 1 Oct 2010 14:45:27 +0200 Subject: block: set the bounce_pfn to the actual DMA limit rather than to max memory The bounce_pfn of the request queue in 64 bit systems is set to the current max_low_pfn. Adding more memory later makes this incorrect. Memory allocated beyond this boot time max_low_pfn appear to require bounce buffers (bounce buffers are actually not allocated but used in calculating segments that may result in "over max segments limit" errors). Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f8f2ddf..a3600a7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); #else if (b_pfn < blk_max_low_pfn) dma = 1; -- cgit v1.1 From 13f98250f587b7defa39ed738dfa74b600e46e7b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:41 +0200 Subject: blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n Currently throttling related files were visible even if user had disabled throttling using config options. It was switching off background throttling of bio but not the cgroup files. This patch fixes it. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 97 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 20ce6f5..86e7066 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1242,41 +1242,6 @@ struct cftype blkio_files[] = { .write_u64 = blkiocg_file_write_u64, }, { - .name = "throttle.read_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.write_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.read_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.write_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_time), @@ -1295,24 +1260,12 @@ struct cftype blkio_files[] = { .read_map = blkiocg_file_read_map, }, { - .name = "throttle.io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, - { .name = "io_serviced", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_serviced), .read_map = blkiocg_file_read_map, }, { - .name = "throttle.io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_serviced), - .read_map = blkiocg_file_read_map, - }, - { .name = "io_service_time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_service_time), @@ -1340,6 +1293,56 @@ struct cftype blkio_files[] = { .name = "reset_stats", .write_u64 = blkiocg_reset_stats, }, +#ifdef CONFIG_BLK_DEV_THROTTLING + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", -- cgit v1.1 From 61014e96e6ed55b8db0af31574eec2a75d4e8755 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:44 +0200 Subject: blkio: deletion of a cgroup was causes oops o Now a cgroup list of blkg elements can contain blkg from multiple policies. Before sending an unlink event, make sure blkg belongs to they policy. If policy does not own the blkg, do not send update for this blkg. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 86e7066..b06ca70 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1411,13 +1411,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* * This blkio_group is being unlinked as associated cgroup is * going away. Let all the IO controlling policies know about - * this event. Currently this is static call to one io - * controlling policy. Once we have more policies in place, we - * need some dynamic registration of callback function. + * this event. */ spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; blkiop->ops.blkio_unlink_group_fn(key, blkg); + } spin_unlock(&blkio_list_lock); } while (1); -- cgit v1.1 From 02977e4af7ed3b478c505e50491ffdf3e1314cf4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:48 +0200 Subject: blkio: Add root group to td->tg_list o Currently all the dynamically allocated groups, except root grp is added to td->tg_list. This was not a problem so far but in next patch I will travel through td->tg_list to process any updates of limits on the group. If root group is not in tg_list, then root group's updates are not processed. o It is better to root group also to tg_list instead of doing special processing for it during limit updates. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index af53f37c..bc2936b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -87,7 +87,7 @@ struct throtl_data unsigned int nr_queued[2]; /* - * number of total undestroyed groups (excluding root group) + * number of total undestroyed groups */ unsigned int nr_undestroyed_grps; @@ -940,7 +940,17 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; tg->iops[0] = tg->iops[1] = -1; - atomic_set(&tg->ref, 1); + + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on tg_list are being deleted during queue exit. Other + * reference will remain there as we don't want to delete this group + * as it is statically allocated and gets destroyed when throtl_data + * goes away. + */ + atomic_set(&tg->ref, 2); + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); @@ -966,10 +976,9 @@ void blk_throtl_exit(struct request_queue *q) spin_lock_irq(q->queue_lock); throtl_release_tgs(td); - blkiocg_del_blkio_group(&td->root_tg.blkg); /* If there are other groups */ - if (td->nr_undestroyed_grps >= 1) + if (td->nr_undestroyed_grps > 0) wait = true; spin_unlock_irq(q->queue_lock); -- cgit v1.1 From fe0714377ee2ca161bf2afb7773e22f15f1786d4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:49 +0200 Subject: blkio: Recalculate the throttled bio dispatch time upon throttle limit change o Currently any cgroup throttle limit changes are processed asynchronousy and the change does not take affect till a new bio is dispatched from same group. o It might happen that a user sets a redicuously low limit on throttling. Say 1 bytes per second on reads. In such cases simple operations like mount a disk can wait for a very long time. o Once bio is throttled, there is no easy way to come out of that wait even if user increases the read limit later. o This patch fixes it. Now if a user changes the cgroup limits, we recalculate the bio dispatch time according to new limits. o Can't take queueu lock under blkcg_lock, hence after the change I wake up the dispatch thread again which recalculates the time. So there are some variables being synchronized across two threads without lock and I had to make use of barriers. Hoping I have used barriers correctly. Any review of memory barrier code especially will help. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 ++++-- block/blk-cgroup.h | 21 ++++---- block/blk-throttle.c | 134 ++++++++++++++++++++++++++++++++++++++++++++------- block/cfq-iosched.c | 4 +- 4 files changed, 139 insertions(+), 35 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b06ca70..52c1213 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -124,7 +124,8 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) if (blkiop->plid != blkg->plid) continue; if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); } } @@ -141,11 +142,13 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, if (fileid == BLKIO_THROTL_read_bps_device && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); if (fileid == BLKIO_THROTL_write_bps_device && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); } } @@ -162,11 +165,13 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg, if (fileid == BLKIO_THROTL_read_iops_device && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); if (fileid == BLKIO_THROTL_write_iops_device && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); } } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2070053..034c355 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,16 +186,17 @@ extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, - unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, - u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, - u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, - unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, - unsigned int write_iops); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bc2936b..11713ed 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -70,6 +70,9 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + /* Some throttle limits got updated for the group */ + bool limits_changed; }; struct throtl_data @@ -93,6 +96,8 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; + + atomic_t limits_changed; }; enum tg_state_flags { @@ -592,15 +597,6 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; - /* - * If group is already on active tree, then update dispatch time - * only if it is lesser than existing dispatch time. Otherwise - * always update the dispatch time - */ - - if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) - return; - /* Update dispatch time */ throtl_dequeue_tg(td, tg); tg->disptime = disptime; @@ -691,6 +687,46 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) return nr_disp; } +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + + /* + * Make sure atomic_inc() effects from + * throtl_update_blkio_group_read_bps(), group of functions are + * visible. + * Is this required or smp_mb__after_atomic_inc() was suffcient + * after the atomic_inc(). + */ + smp_rmb(); + if (!atomic_read(&td->limits_changed)) + return; + + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * Do I need an smp_rmb() here to make sure tg->limits_changed + * update is visible. I am relying on smp_rmb() at the + * beginning of function and not putting a new one here. + */ + + if (throtl_tg_on_rr(tg) && tg->limits_changed) { + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], + tg->bps[WRITE], tg->iops[READ], + tg->iops[WRITE]); + tg_update_disptime(td, tg); + tg->limits_changed = false; + } + } + + smp_mb__before_atomic_dec(); + atomic_dec(&td->limits_changed); + smp_mb__after_atomic_dec(); +} + /* Dispatch throttled bios. Should be called without queue lock held. */ static int throtl_dispatch(struct request_queue *q) { @@ -701,6 +737,8 @@ static int throtl_dispatch(struct request_queue *q) spin_lock_irq(q->queue_lock); + throtl_process_limit_change(td); + if (!total_nr_queued(td)) goto out; @@ -821,28 +859,74 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) spin_unlock_irqrestore(td->queue->queue_lock, flags); } -static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, - u64 read_bps) +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. + */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[READ] = read_bps; + /* Make sure read_bps is updated before setting limits_changed */ + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + + /* Make sure tg->limits_changed is updated before td->limits_changed */ + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, - u64 write_bps) +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[WRITE] = write_bps; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, - unsigned int read_iops) +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[READ] = read_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, - unsigned int write_iops) +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[WRITE] = write_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } void throtl_shutdown_timer_wq(struct request_queue *q) @@ -886,8 +970,14 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* * There is already another bio queued in same dir. No * need to update dispatch time. + * Still update the disptime if rate limits on this group + * were changed. */ - update_disptime = false; + if (!tg->limits_changed) + update_disptime = false; + else + tg->limits_changed = false; + goto queue_bio; } @@ -929,6 +1019,7 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; + atomic_set(&td->limits_changed, 0); /* Init root group */ tg = &td->root_tg; @@ -996,6 +1087,13 @@ void blk_throtl_exit(struct request_queue *q) */ if (wait) synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. + */ + throtl_shutdown_timer_wq(q); throtl_td_free(td); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6845926..86338d5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -951,8 +951,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { cfqg_of_blkg(blkg)->weight = weight; } -- cgit v1.1 From 3aad5d3ee4e4fce8f4b5bb6ca73342dcade42b33 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:51:14 +0200 Subject: blkio-throttle: Fix link failure failure on i386 o Randy Dunlap reported following linux-next failure. This patch fixes it. on i386: blk-throttle.c:(.text+0x1abb8): undefined reference to `__udivdi3' blk-throttle.c:(.text+0x1b1dc): undefined reference to `__udivdi3' o bytes_per_second interface is 64bit and I was continuing to do 64 bit division even on 32bit platform without help of special macros/functions hence the failure. Signed-off-by: Vivek Goyal Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 11713ed..a467002 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -378,7 +378,8 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -396,8 +397,10 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (!nr_slices) return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; - bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; if (!bytes_trim && !io_trim) @@ -415,7 +418,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" " start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); @@ -462,7 +465,7 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + u64 bytes_allowed, extra_bytes, tmp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -473,8 +476,9 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); + do_div(tmp, MSEC_PER_SEC); + bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { if (wait) -- cgit v1.1 From 5e901a2b95db709c5e40599ff4df6029be1e2a12 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:38 +0200 Subject: blkio-throttle: There is no need to convert jiffies to milli seconds o Do not convert jiffies to mili seconds as it is not required. Just work with jiffies and HZ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a467002..c1bc1b6 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -439,8 +439,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) @@ -476,8 +475,8 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); - do_div(tmp, MSEC_PER_SEC); + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { -- cgit v1.1 From 9355aede5a3c4975e0ba8bbfe2b9d1fd73308916 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:41 +0200 Subject: blkio-throttle: limit max iops value to UINT_MAX - Limit max iops value to UINT_MAX and return error to user if value is more than that instead of accepting bigger values and truncating implicitly. Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 +++++++---- block/blk-cgroup.h | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52c1213..0f59b23 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -656,10 +656,10 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp, iops; + unsigned long major, minor, temp; int i = 0; dev_t dev; - u64 bps; + u64 bps, iops; memset(s, 0, sizeof(s)); @@ -731,13 +731,16 @@ static int blkio_policy_parse_and_set(char *buf, break; case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: - ret = strict_strtoul(s[1], 10, &iops); + ret = strict_strtoull(s[1], 10, &iops); if (ret) return -EINVAL; + if (iops > THROTL_IOPS_MAX) + return -EINVAL; + newpn->plid = plid; newpn->fileid = fileid; - newpn->val.iops = iops; + newpn->val.iops = (unsigned int)iops; break; } break; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 034c355..ea4861b 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -20,6 +20,9 @@ enum blkio_policy_id { BLKIO_POLICY_THROTL, /* Throttling */ }; +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP -- cgit v1.1 From c49c06e4960949a9bced708858433fcf6ca36a9c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:42 +0200 Subject: blkio-throttle: Fix possible multiplication overflow in iops calculations o User can specify max iops value of 32bit (UINT_MAX), through cgroup interface. If a user has specified say 4294967294 (UNIT_MAX - 2), then on 32bit platform, following multiplication can overflow. io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) o Explicitly cast the multiplication to 64bit and then perform division and then check whether result is still great then UNINT_MAX. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c1bc1b6..56ad453 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -430,6 +430,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -439,7 +440,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) -- cgit v1.1 From 2a48fc0ab24241755dc93bfd4f01d68efab47f5a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Jun 2010 14:28:52 +0200 Subject: block: autoconvert trivial BKL users to private mutex The block device drivers have all gained new lock_kernel calls from a recent pushdown, and some of the drivers were already using the BKL before. This turns the BKL into a set of per-driver mutexes. Still need to check whether this is safe to do. file=$1 name=$2 if grep -q lock_kernel ${file} ; then if grep -q 'include.*linux.mutex.h' ${file} ; then sed -i '/include.*/d' ${file} else sed -i 's/include.*.*$/include /g' ${file} fi sed -i ${file} \ -e "/^#include.*linux.mutex.h/,$ { 1,/^\(static\|int\|long\)/ { /^\(static\|int\|long\)/istatic DEFINE_MUTEX(${name}_mutex); } }" \ -e "s/\(un\)*lock_kernel\>[ ]*()/mutex_\1lock(\&${name}_mutex)/g" \ -e '/[ ]*cycle_kernel_lock();/d' else sed -i -e '/include.*\/d' ${file} \ -e '/cycle_kernel_lock()/d' fi Signed-off-by: Arnd Bergmann --- block/bsg.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 82d5882..ffd79ca 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -843,9 +842,7 @@ static int bsg_open(struct inode *inode, struct file *file) { struct bsg_device *bd; - lock_kernel(); bd = bsg_get_device(inode, file); - unlock_kernel(); if (IS_ERR(bd)) return PTR_ERR(bd); -- cgit v1.1 From 892b6f90db81cccb723d5d92f4fddc2d68b206e1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 13 Oct 2010 21:18:03 +0200 Subject: block: Ensure physical block size is unsigned int Physical block size was declared unsigned int to accomodate the maximum size reported by READ CAPACITY(16). Make sure we use the right type in the related functions. Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index a3600a7..315b88c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); * hardware can operate on without reverting to read-modify-write * operations. */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; -- cgit v1.1 From e817bf3f68f55e7307c3e9abe5f32d0c05c83988 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 15 Oct 2010 15:49:18 +0200 Subject: block: Fix double free in blk_integrity_unregister Commit 3839e4b introduced a kobject_put but failed to remove the kmem_cache_free beneath it, leading to a double free. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 885cbb5..54bcba6 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -432,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk) kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); kobject_put(&bi->kobj); - kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } EXPORT_SYMBOL(blk_integrity_unregister); -- cgit v1.1 From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 15 Aug 2010 18:52:59 +0200 Subject: llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. ===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... +.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann Cc: Julia Lawall Cc: Christoph Hellwig --- block/bsg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 82d5882..83e381a2 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -968,6 +968,7 @@ static const struct file_operations bsg_fops = { .release = bsg_release, .unlocked_ioctl = bsg_ioctl, .owner = THIS_MODULE, + .llseek = default_llseek, }; void bsg_unregister_queue(struct request_queue *q) -- cgit v1.1 From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 19 Oct 2010 09:05:00 +0200 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. When reloading partition tables, quiesce IO to ensure that no request references to the partition struct exists. When it is safe to free the partition table, the IO for that device is restarted again. Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++++++++++-------- block/blk-merge.c | 2 +- block/blk.h | 4 ---- block/genhd.c | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 797d509..ddc6833 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) + if (priv) { rl->elvpriv++; - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; + /* + * Don't do stats for non-priv requests + */ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + } + spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index 6a72546..38ff234 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); diff --git a/block/blk.h b/block/blk.h index 6738831..1340cce 100644 --- a/block/blk.h +++ b/block/blk.h @@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - - /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the diff --git a/block/genhd.c b/block/genhd.c index 7923e72..8313834 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); + struct gendisk *disk = ptbl->disk; + struct request_queue *q = disk->queue; + unsigned long flags; kfree(ptbl); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); + + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; + new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); -- cgit v1.1 From b4627321e18582dcbdeb45d77df29d3177107c65 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 22 Oct 2010 09:48:43 +0200 Subject: cfq-iosched: Fix a gcc 4.5 warning and put some comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Andi encountedred following warning with gcc 4.5 linux/block/cfq-iosched.c: In function ‘cfq_dispatch_requests’: linux/block/cfq-iosched.c:2156:3: warning: array subscript is above array bounds - Warning happens due to following code. slice = group_slice * count / max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); gcc is complaining about cfqg->busy_queues_avg[] being indexed by CFQ prio classes (RT, BE and IDLE) while the array size is only 2. - At run time, we never access cfqg->busy_queues_avg[IDLE] and return from function before this code hits. - To fix warning increase the array size though it will remain unused. This patch also puts some comments to clarify some of the confusions. - I have taken Jens's patch and modified it a bit. - Compile tested with gcc 4.4 and boot tested. I don't have gcc 4.5 running, Andi can you please test it with gcc 4.5 to make sure it worked. Reported-by: Andi Kleen Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 86338d5..dfceb63 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -157,6 +157,7 @@ enum wl_prio_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, + CFQ_PRIO_NR, }; /* @@ -181,10 +182,19 @@ struct cfq_group { /* number of cfqq currently on this group */ int nr_cfqq; - /* Per group busy queus average. Useful for workload slice calc. */ - unsigned int busy_queues_avg[2]; /* - * rr lists of queues with requests, onle rr for each priority class. + * Per group busy queus average. Useful for workload slice calc. We + * create the array for each prio class but at run time it is used + * only for RT and BE class and slot for IDLE class remains unused. + * This is primarily done to avoid confusion and a gcc warning. + */ + unsigned int busy_queues_avg[CFQ_PRIO_NR]; + /* + * rr lists of queues with requests. We maintain service trees for + * RT and BE classes. These trees are subdivided in subclasses + * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE + * class there is no subclassification and all the cfq queues go on + * a single tree service_tree_idle. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3]; -- cgit v1.1 From e52eec13cd6b7f30ab19081b387813e03e592ae5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 8 Sep 2010 16:54:17 +0200 Subject: SYSFS: Allow boot time switching between deprecated and modern sysfs layout I have some systems which need legacy sysfs due to old tools that are making assumptions that a directory can never be a symlink to another directory, and it's a big hazzle to compile separate kernels for them. This patch turns CONFIG_SYSFS_DEPRECATED into a run time option that can be switched on/off the kernel command line. This way the same binary can be used in both cases with just a option on the command line. The old CONFIG_SYSFS_DEPRECATED_V2 option is still there to set the default. I kept the weird name to not break existing config files. Also the compat code can be still completely disabled by undefining CONFIG_SYSFS_DEPRECATED_SWITCH -- just the optimizer takes care of this now instead of lots of ifdefs. This makes the code look nicer. v2: This is an updated version on top of Kay's patch to only handle the block devices. I tested it on my old systems and that seems to work. Cc: axboe@kernel.dk Signed-off-by: Andi Kleen Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 59a2db6..4e28a84 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -22,9 +22,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -#ifndef CONFIG_SYSFS_DEPRECATED struct kobject *block_depr; -#endif /* for extended dynamic devt allocation, currently only one major is used */ #define MAX_EXT_DEVT (1 << MINORBITS) @@ -803,10 +801,9 @@ static int __init genhd_device_init(void) register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -#ifndef CONFIG_SYSFS_DEPRECATED /* create top-level block dir */ - block_depr = kobject_create_and_add("block", NULL); -#endif + if (!sysfs_deprecated) + block_depr = kobject_create_and_add("block", NULL); return 0; } -- cgit v1.1 From 7ad58c028652753814054f4e3ac58f925e7343f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 Oct 2010 20:40:26 +0200 Subject: block: fix use-after-free bug in blk throttle code blk_throtl_exit() frees the throttle data hanging off the queue in blk_cleanup_queue(), but blk_put_queue() will indirectly dereference this data when calling blk_sync_queue() which in turns calls throtl_shutdown_timer_wq(). Fix this by moving the freeing of the throttle data to when the queue is truly being released, and post the call to blk_sync_queue(). Reported-by: Ingo Molnar Tested-by: Ingo Molnar Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- block/blk-sysfs.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 4514146..51efd83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -462,8 +462,6 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); - blk_throtl_exit(q); - blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index da8a8a4..013457f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -471,6 +471,8 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); + blk_throtl_exit(q); + if (rl->rq_pool) mempool_destroy(rl->rq_pool); -- cgit v1.1 From f253b86b4ad1b3220544e75880510fd455ebd23f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 24 Oct 2010 22:06:02 +0200 Subject: Revert "block: fix accounting bug on cross partition merges" This reverts commit 7681bfeeccff5efa9eb29bf09249a3c400b15327. Conflicts: include/linux/genhd.h It has numerous issues with the cleanup path and non-elevator devices. Revert it for now so we can come up with a clean version without rushing things. Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++---------------- block/blk-merge.c | 2 +- block/blk.h | 4 ++++ block/genhd.c | 14 -------------- 4 files changed, 13 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 51efd83..f854887 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,15 +64,13 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) { - part = rq->part; + if (!new_io) part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + else { part_round_stats(cpu, part); part_inc_in_flight(part, rw); - rq->part = part; } part_stat_unlock(); @@ -130,7 +128,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); - rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -805,16 +802,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) { + if (priv) rl->elvpriv++; - /* - * Don't do stats for non-priv requests - */ - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; - } - + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1791,7 +1783,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = req->part; + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1811,7 +1803,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = req->part; + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index 0a2fd8a..77b7c26 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = req->part; + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); diff --git a/block/blk.h b/block/blk.h index 1e675e5..2db8f32 100644 --- a/block/blk.h +++ b/block/blk.h @@ -116,6 +116,10 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); +void elv_quiesce_start(struct request_queue *q); +void elv_quiesce_end(struct request_queue *q); + + /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the diff --git a/block/genhd.c b/block/genhd.c index a8adf96..5fa2b44 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -929,15 +929,8 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); - struct gendisk *disk = ptbl->disk; - struct request_queue *q = disk->queue; - unsigned long flags; kfree(ptbl); - - spin_lock_irqsave(q->queue_lock, flags); - elv_quiesce_end(q); - spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -955,17 +948,11 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; - struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); - - spin_lock_irq(q->queue_lock); - elv_quiesce_start(q); - spin_unlock_irq(q->queue_lock); - call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -1006,7 +993,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; - new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); -- cgit v1.1 From 9284bcf4e335e5f18a8bc7b26461c33ab60d0689 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 08:10:18 -0600 Subject: block: check for proper length of iov entries in blk_rq_map_user_iov() Ensure that we pass down properly validated iov segments before calling into the mapping or copy functions. Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index d4a586d..5d5dbe4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -205,6 +205,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; break; } + if (!iov[i].iov_len) + return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.1 From 9f864c80913467312c7b8690e41fb5ebd1b50e92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 11:31:42 -0600 Subject: block: take care not to overflow when calculating total iov length Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a8b5a10..4f4230b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->iovec_count) { const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *iov; + struct sg_iovec *sg_iov; + struct iovec *iov; + int i; - iov = kmalloc(size, GFP_KERNEL); - if (!iov) { + sg_iov = kmalloc(size, GFP_KERNEL); + if (!sg_iov) { ret = -ENOMEM; goto out; } - if (copy_from_user(iov, hdr->dxferp, size)) { - kfree(iov); + if (copy_from_user(sg_iov, hdr->dxferp, size)) { + kfree(sg_iov); ret = -EFAULT; goto out; } + /* + * Sum up the vecs, making sure they don't overflow + */ + iov = (struct iovec *) sg_iov; + iov_data_len = 0; + for (i = 0; i < hdr->iovec_count; i++) { + if (iov_data_len + iov[i].iov_len < iov_data_len) { + kfree(sg_iov); + ret = -EINVAL; + goto out; + } + iov_data_len += iov[i].iov_len; + } + /* SG_IO howto says that the shorter of the two wins */ - iov_data_len = iov_length((struct iovec *)iov, - hdr->iovec_count); if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten((struct iovec *)iov, + hdr->iovec_count = iov_shorten(iov, hdr->iovec_count, hdr->dxfer_len); iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, + ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(iov); + kfree(sg_iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); -- cgit v1.1 From 77304d2abac6101f7249754ffdd4421258877ab0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Nov 2010 14:39:12 +0100 Subject: block: read i_size with i_size_read() Convert direct reads of an inode's i_size to using i_size_read(). i_size_{read,write} use a seqcount to protect reads from accessing incomple writes. Concurrent i_size_write()s require mutual exclussion to protect the seqcount that is used by i_size_{read,write}. But i_size_read() callers do not need to use additional locking. Signed-off-by: Mike Snitzer Acked-by: NeilBrown Acked-by: Lars Ellenberg Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/compat_ioctl.c | 4 ++-- block/ioctl.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f0834e2..17fcb83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1351,7 +1351,7 @@ static void handle_bad_sector(struct bio *bio) bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio->bi_sector + bio_sectors(bio), - (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); + (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); } @@ -1404,7 +1404,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { sector_t sector = bio->bi_sector; diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 119f07b..58c6ee5b 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -744,13 +744,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return compat_put_ulong(arg, size >> 9); case BLKGETSIZE64_32: - return compat_put_u64(arg, bdev->bd_inode->i_size); + return compat_put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESETUP32: case BLKTRACESTART: /* compatible */ diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb..38aa194 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -125,7 +125,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, start >>= 9; len >>= 9; - if (start + len > (bdev->bd_inode->i_size >> 9)) + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; if (secure) flags |= BLKDEV_DISCARD_SECURE; @@ -307,12 +307,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, ret = blkdev_reread_part(bdev); break; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return put_ulong(arg, size >> 9); case BLKGETSIZE64: - return put_u64(arg, bdev->bd_inode->i_size); + return put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACESETUP: -- cgit v1.1 From a014741c0adfb8fb79952939ca087cf03d272bb9 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 8 Nov 2010 14:42:40 +0100 Subject: block: ioctl: fix information leak to userland Structure hd_geometry is copied to userland with 4 padding bytes between cylinders and start fields uninitialized on 64-bit platforms. It leads to leaking of contents of kernel stack memory. Currently there is no memset() in real implementations of getgeo() in drivers/block/, so it makes sense to have memset() in blkdev_ioctl(). Signed-off-by: Vasiliy Kulikov Signed-off-by: Jens Axboe --- block/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 38aa194..3d866d0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -242,6 +242,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, * We need to set the startsect first, the driver may * want to override it. */ + memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) -- cgit v1.1 From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ------- block/elevator.c | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 17fcb83..4ce953f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1194,13 +1194,6 @@ static int __make_request(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; int rw_flags; - /* REQ_HARDBARRIER is no more */ - if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, - "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/elevator.c b/block/elevator.c index 282e830..2569512 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -429,7 +429,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; + stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -691,7 +691,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { -- cgit v1.1 From cedb4a7d9f6aedb0dce94d6285b69dcb3c10fa05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Nov 2010 13:37:54 +0100 Subject: block: remove unused copy_io_context() Reported-by: Oleg Nesterov Signed-off-by: Jens Axboe --- block/blk-ioc.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c5..3c7a339 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -153,20 +153,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node) } EXPORT_SYMBOL(get_io_context); -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_long_read(&src->refcount) == 0); - atomic_long_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", -- cgit v1.1