diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-24 10:16:26 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-24 10:16:26 -0700 |
commit | 6c5103890057b1bb781b26b7aae38d33e4c517d8 (patch) | |
tree | e6e57961dcddcb5841acb34956e70b9dc696a880 /block | |
parent | 3dab04e6978e358ad2307bca563fabd6c5d2c58b (diff) | |
parent | 9d2e157d970a73b3f270b631828e03eb452d525e (diff) | |
download | op-kernel-dev-6c5103890057b1bb781b26b7aae38d33e4c517d8.zip op-kernel-dev-6c5103890057b1bb781b26b7aae38d33e4c517d8.tar.gz |
Merge branch 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.39/core' of git://git.kernel.dk/linux-2.6-block: (65 commits)
Documentation/iostats.txt: bit-size reference etc.
cfq-iosched: removing unnecessary think time checking
cfq-iosched: Don't clear queue stats when preempt.
blk-throttle: Reset group slice when limits are changed
blk-cgroup: Only give unaccounted_time under debug
cfq-iosched: Don't set active queue in preempt
block: fix non-atomic access to genhd inflight structures
block: attempt to merge with existing requests on plug flush
block: NULL dereference on error path in __blkdev_get()
cfq-iosched: Don't update group weights when on service tree
fs: assign sb->s_bdi to default_backing_dev_info if the bdi is going away
block: Require subsystems to explicitly allocate bio_set integrity mempool
jbd2: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
jbd: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
fs: make fsync_buffers_list() plug
mm: make generic_writepages() use plugging
blk-cgroup: Add unaccounted time to timeslice_used.
block: fixup plugging stubs for !CONFIG_BLOCK
block: remove obsolete comments for blkdev_issue_zeroout.
blktrace: Use rq->cmd_flags directly in blk_add_trace_rq.
...
Fix up conflicts in fs/{aio.c,super.c}
Diffstat (limited to 'block')
-rw-r--r-- | block/blk-cgroup.c | 16 | ||||
-rw-r--r-- | block/blk-cgroup.h | 14 | ||||
-rw-r--r-- | block/blk-core.c | 646 | ||||
-rw-r--r-- | block/blk-exec.c | 4 | ||||
-rw-r--r-- | block/blk-flush.c | 439 | ||||
-rw-r--r-- | block/blk-lib.c | 2 | ||||
-rw-r--r-- | block/blk-merge.c | 6 | ||||
-rw-r--r-- | block/blk-settings.c | 15 | ||||
-rw-r--r-- | block/blk-sysfs.c | 2 | ||||
-rw-r--r-- | block/blk-throttle.c | 139 | ||||
-rw-r--r-- | block/blk.h | 16 | ||||
-rw-r--r-- | block/cfq-iosched.c | 163 | ||||
-rw-r--r-- | block/cfq.h | 6 | ||||
-rw-r--r-- | block/deadline-iosched.c | 9 | ||||
-rw-r--r-- | block/elevator.c | 108 | ||||
-rw-r--r-- | block/genhd.c | 18 | ||||
-rw-r--r-- | block/noop-iosched.c | 8 |
17 files changed, 955 insertions, 656 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 455768a..2bef570 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, } EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, + unsigned long unaccounted_time) { unsigned long flags; spin_lock_irqsave(&blkg->stats_lock, flags); blkg->stats.time += time; + blkg->stats.unaccounted_time += unaccounted_time; spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); @@ -604,6 +606,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.sectors, cb, dev); #ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_UNACCOUNTED_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.unaccounted_time, cb, dev); if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { uint64_t sum = blkg->stats.avg_queue_size_sum; uint64_t samples = blkg->stats.avg_queue_size_samples; @@ -1125,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_unaccounted_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_UNACCOUNTED_TIME, 0); case BLKIO_PROP_dequeue: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_DEQUEUE, 0); @@ -1382,6 +1390,12 @@ struct cftype blkio_files[] = { BLKIO_PROP_dequeue), .read_map = blkiocg_file_read_map, }, + { + .name = "unaccounted_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_unaccounted_time), + .read_map = blkiocg_file_read_map, + }, #endif }; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index ea4861b..10919fa 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -49,6 +49,8 @@ enum stat_type { /* All the single valued stats go below this */ BLKIO_STAT_TIME, BLKIO_STAT_SECTORS, + /* Time not charged to this cgroup */ + BLKIO_STAT_UNACCOUNTED_TIME, #ifdef CONFIG_DEBUG_BLK_CGROUP BLKIO_STAT_AVG_QUEUE_SIZE, BLKIO_STAT_IDLE_TIME, @@ -81,6 +83,7 @@ enum blkcg_file_name_prop { BLKIO_PROP_io_serviced, BLKIO_PROP_time, BLKIO_PROP_sectors, + BLKIO_PROP_unaccounted_time, BLKIO_PROP_io_service_time, BLKIO_PROP_io_wait_time, BLKIO_PROP_io_merged, @@ -114,6 +117,8 @@ struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; uint64_t sectors; + /* Time not charged to this cgroup */ + uint64_t unaccounted_time; uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP /* Sum of number of IOs queued across all samples */ @@ -240,7 +245,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } #endif -#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MIN 10 #define BLKIO_WEIGHT_MAX 1000 #define BLKIO_WEIGHT_DEFAULT 500 @@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time); + unsigned long time, + unsigned long unaccounted_time); void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync); void blkiocg_update_completion_stats(struct blkio_group *blkg, @@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) {} + unsigned long time, + unsigned long unaccounted_time) +{} static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) {} static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, diff --git a/block/blk-core.c b/block/blk-core.c index a63336d..59b5c00 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> +#include <linux/list_sort.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -149,39 +150,29 @@ EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - struct request_queue *q = rq->q; - - if (&q->flush_rq != rq) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } + if (unlikely(nbytes > bio->bi_size)) { + printk(KERN_ERR "%s: want %u bytes done, %u left\n", + __func__, nbytes, bio->bi_size); + nbytes = bio->bi_size; + } - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { - /* - * Okay, this is the sequenced flush request in - * progress, just record the error; - */ - if (error && !q->flush_err) - q->flush_err = error; - } + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -208,135 +199,43 @@ void blk_dump_rq_flags(struct request *rq, char *msg) EXPORT_SYMBOL(blk_dump_rq_flags); /* - * "plug" the device if there are no outstanding requests: this will - * force the transfer to start only after we have put all the requests - * on the list. - * - * This is called with interrupts off and no requests on the queue and - * with the queue lock held. - */ -void blk_plug_device(struct request_queue *q) + * Make sure that plugs that were pending when this function was entered, + * are now complete and requests pushed to the queue. +*/ +static inline void queue_sync_plugs(struct request_queue *q) { - WARN_ON(!irqs_disabled()); - /* - * don't plug a stopped queue, it must be paired with blk_start_queue() - * which will restart the queueing + * If the current process is plugged and has barriers submitted, + * we will livelock if we don't unplug first. */ - if (blk_queue_stopped(q)) - return; - - if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { - mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - trace_block_plug(q); - } -} -EXPORT_SYMBOL(blk_plug_device); - -/** - * blk_plug_device_unlocked - plug a device without queue lock held - * @q: The &struct request_queue to plug - * - * Description: - * Like @blk_plug_device(), but grabs the queue lock and disables - * interrupts. - **/ -void blk_plug_device_unlocked(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_plug_device(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_plug_device_unlocked); - -/* - * remove the queue from the plugged list, if present. called with - * queue lock held and interrupts disabled. - */ -int blk_remove_plug(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) - return 0; - - del_timer(&q->unplug_timer); - return 1; + blk_flush_plug(current); } -EXPORT_SYMBOL(blk_remove_plug); -/* - * remove the plug and let it rip.. - */ -void __generic_unplug_device(struct request_queue *q) +static void blk_delay_work(struct work_struct *work) { - if (unlikely(blk_queue_stopped(q))) - return; - if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) - return; + struct request_queue *q; - q->request_fn(q); + q = container_of(work, struct request_queue, delay_work.work); + spin_lock_irq(q->queue_lock); + __blk_run_queue(q, false); + spin_unlock_irq(q->queue_lock); } /** - * generic_unplug_device - fire a request queue - * @q: The &struct request_queue in question + * blk_delay_queue - restart queueing after defined interval + * @q: The &struct request_queue in question + * @msecs: Delay in msecs * * Description: - * Linux uses plugging to build bigger requests queues before letting - * the device have at them. If a queue is plugged, the I/O scheduler - * is still adding and merging requests on the queue. Once the queue - * gets unplugged, the request_fn defined for the queue is invoked and - * transfers started. - **/ -void generic_unplug_device(struct request_queue *q) -{ - if (blk_queue_plugged(q)) { - spin_lock_irq(q->queue_lock); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - } -} -EXPORT_SYMBOL(generic_unplug_device); - -static void blk_backing_dev_unplug(struct backing_dev_info *bdi, - struct page *page) -{ - struct request_queue *q = bdi->unplug_io_data; - - blk_unplug(q); -} - -void blk_unplug_work(struct work_struct *work) -{ - struct request_queue *q = - container_of(work, struct request_queue, unplug_work); - - trace_block_unplug_io(q); - q->unplug_fn(q); -} - -void blk_unplug_timeout(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - - trace_block_unplug_timer(q); - kblockd_schedule_work(q, &q->unplug_work); -} - -void blk_unplug(struct request_queue *q) + * Sometimes queueing needs to be postponed for a little while, to allow + * resources to come back. This function will make sure that queueing is + * restarted around the specified time. + */ +void blk_delay_queue(struct request_queue *q, unsigned long msecs) { - /* - * devices don't necessarily have an ->unplug_fn defined - */ - if (q->unplug_fn) { - trace_block_unplug_io(q); - q->unplug_fn(q); - } + schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs)); } -EXPORT_SYMBOL(blk_unplug); +EXPORT_SYMBOL(blk_delay_queue); /** * blk_start_queue - restart a previously stopped queue @@ -372,7 +271,7 @@ EXPORT_SYMBOL(blk_start_queue); **/ void blk_stop_queue(struct request_queue *q) { - blk_remove_plug(q); + cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); @@ -390,13 +289,16 @@ EXPORT_SYMBOL(blk_stop_queue); * that its ->make_request_fn will not re-add plugging prior to calling * this function. * + * This function does not cancel any asynchronous activity arising + * out of elevator or throttling code. That would require elevaotor_exit() + * and blk_throtl_exit() to be called with queue lock initialized. + * */ void blk_sync_queue(struct request_queue *q) { - del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); - cancel_work_sync(&q->unplug_work); - throtl_shutdown_timer_wq(q); + cancel_delayed_work_sync(&q->delay_work); + queue_sync_plugs(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -412,14 +314,9 @@ EXPORT_SYMBOL(blk_sync_queue); */ void __blk_run_queue(struct request_queue *q, bool force_kblockd) { - blk_remove_plug(q); - if (unlikely(blk_queue_stopped(q))) return; - if (elv_queue_empty(q)) - return; - /* * Only recurse once to avoid overrunning the stack, let the unplug * handling reinvoke the handler shortly if we already got there. @@ -427,10 +324,8 @@ void __blk_run_queue(struct request_queue *q, bool force_kblockd) if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } + } else + queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); } EXPORT_SYMBOL(__blk_run_queue); @@ -457,6 +352,11 @@ void blk_put_queue(struct request_queue *q) kobject_put(&q->kobj); } +/* + * Note: If a driver supplied the queue lock, it should not zap that lock + * unexpectedly as some queue cleanup components like elevator_exit() and + * blk_throtl_exit() need queue lock. + */ void blk_cleanup_queue(struct request_queue *q) { /* @@ -475,6 +375,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -517,8 +419,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; - q->backing_dev_info.unplug_io_data = q; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; @@ -538,17 +438,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); - init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); - INIT_LIST_HEAD(&q->pending_flushes); - INIT_WORK(&q->unplug_work, blk_unplug_work); + INIT_LIST_HEAD(&q->flush_queue[0]); + INIT_LIST_HEAD(&q->flush_queue[1]); + INIT_LIST_HEAD(&q->flush_data_in_flight); + INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); + /* + * By default initialize queue_lock to internal lock and driver can + * override it later if need be. + */ + q->queue_lock = &q->__queue_lock; + return q; } EXPORT_SYMBOL(blk_alloc_queue_node); @@ -631,9 +538,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; - q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; - q->queue_lock = lock; + + /* Override internal queue lock with supplied lock pointer */ + if (lock) + q->queue_lock = lock; /* * This also sets hw/phys segments, boundary and size @@ -666,6 +575,8 @@ int blk_get_queue(struct request_queue *q) static inline void blk_free_request(struct request_queue *q, struct request *rq) { + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); mempool_free(rq, q->rq.rq_pool); @@ -762,6 +673,25 @@ static void freed_request(struct request_queue *q, int sync, int priv) } /* + * Determine if elevator data should be initialized when allocating the + * request associated with @bio. + */ +static bool blk_rq_should_init_elevator(struct bio *bio) +{ + if (!bio) + return true; + + /* + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + */ + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) + return false; + + return true; +} + +/* * Get a free request, queue_lock must be held. * Returns NULL on failure, with queue_lock held. * Returns !NULL on success, with queue_lock *not held*. @@ -773,7 +703,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv; + int may_queue, priv = 0; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -817,9 +747,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->count[is_sync]++; rl->starved[is_sync] = 0; - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (blk_rq_should_init_elevator(bio)) { + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); + if (priv) + rl->elvpriv++; + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; @@ -866,8 +798,8 @@ out: } /* - * No available requests for this queue, unplug the device and wait for some - * requests to become available. + * No available requests for this queue, wait for some requests to become + * available. * * Called with q->queue_lock held, and returns with it unlocked. */ @@ -888,7 +820,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, trace_block_sleeprq(q, bio, rw_flags & 1); - __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); io_schedule(); @@ -1010,6 +941,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_requeue_request); +static void add_acct_request(struct request_queue *q, struct request *rq, + int where) +{ + drive_stat_acct(rq, 1); + __elv_add_request(q, rq, where); +} + /** * blk_insert_request - insert a special request into a request queue * @q: request queue where request should be inserted @@ -1052,8 +990,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, where, 0); + add_acct_request(q, rq, where); __blk_run_queue(q, false); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -1174,6 +1111,113 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); +static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + + /* + * Debug stuff, kill later + */ + if (!rq_mergeable(req)) { + blk_dump_rq_flags(req, "back"); + return false; + } + + if (!ll_back_merge_fn(q, req, bio)) + return false; + + trace_block_bio_backmerge(q, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + + drive_stat_acct(req, 0); + return true; +} + +static bool bio_attempt_front_merge(struct request_queue *q, + struct request *req, struct bio *bio) +{ + const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + sector_t sector; + + /* + * Debug stuff, kill later + */ + if (!rq_mergeable(req)) { + blk_dump_rq_flags(req, "front"); + return false; + } + + if (!ll_front_merge_fn(q, req, bio)) + return false; + + trace_block_bio_frontmerge(q, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + sector = bio->bi_sector; + + bio->bi_next = req->bio; + req->bio = bio; + + /* + * may not be valid. if the low level driver said + * it didn't need a bounce buffer then it better + * not touch req->buffer either... + */ + req->buffer = bio_data(bio); + req->__sector = bio->bi_sector; + req->__data_len += bio->bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + + drive_stat_acct(req, 0); + return true; +} + +/* + * Attempts to merge with the plugged list in the current process. Returns + * true if merge was succesful, otherwise false. + */ +static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, + struct bio *bio) +{ + struct blk_plug *plug; + struct request *rq; + bool ret = false; + + plug = tsk->plug; + if (!plug) + goto out; + + list_for_each_entry_reverse(rq, &plug->list, queuelist) { + int el_ret; + + if (rq->q != q) + continue; + + el_ret = elv_try_merge(rq, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + ret = bio_attempt_back_merge(q, rq, bio); + if (ret) + break; + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + ret = bio_attempt_front_merge(q, rq, bio); + if (ret) + break; + } + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; @@ -1189,26 +1233,12 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -/* - * Only disabling plugging for non-rotational devices if it does tagging - * as well, otherwise we do need the proper merging - */ -static inline bool queue_should_plug(struct request_queue *q) -{ - return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); -} - static int __make_request(struct request_queue *q, struct bio *bio) { - struct request *req; - int el_ret; - unsigned int bytes = bio->bi_size; - const unsigned short prio = bio_prio(bio); const bool sync = !!(bio->bi_rw & REQ_SYNC); - const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); - const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; - int where = ELEVATOR_INSERT_SORT; - int rw_flags; + struct blk_plug *plug; + int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; + struct request *req; /* * low level driver can indicate that it wants pages above a @@ -1217,78 +1247,36 @@ static int __make_request(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); - spin_lock_irq(q->queue_lock); - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { - where = ELEVATOR_INSERT_FRONT; + spin_lock_irq(q->queue_lock); + where = ELEVATOR_INSERT_FLUSH; goto get_rq; } - if (elv_queue_empty(q)) - goto get_rq; - - el_ret = elv_merge(q, &req, bio); - switch (el_ret) { - case ELEVATOR_BACK_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_back_merge_fn(q, req, bio)) - break; - - trace_block_bio_backmerge(q, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bytes; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); + /* + * Check if we can merge with the plugged list before grabbing + * any locks. + */ + if (attempt_plug_merge(current, q, bio)) goto out; - case ELEVATOR_FRONT_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_front_merge_fn(q, req, bio)) - break; - - trace_block_bio_frontmerge(q, bio); + spin_lock_irq(q->queue_lock); - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { - blk_rq_set_mixed_merge(req); - req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= ff; + el_ret = elv_merge(q, &req, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + BUG_ON(req->cmd_flags & REQ_ON_PLUG); + if (bio_attempt_back_merge(q, req, bio)) { + if (!attempt_back_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out_unlock; + } + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + BUG_ON(req->cmd_flags & REQ_ON_PLUG); + if (bio_attempt_front_merge(q, req, bio)) { + if (!attempt_front_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out_unlock; } - - bio->bi_next = req->bio; - req->bio = bio; - - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); - req->__sector = bio->bi_sector; - req->__data_len += bytes; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - /* ELV_NO_MERGE: elevator says don't/can't merge. */ - default: - ; } get_rq: @@ -1315,20 +1303,35 @@ get_rq: */ init_request_from_bio(req, bio); - spin_lock_irq(q->queue_lock); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); - if (queue_should_plug(q) && elv_queue_empty(q)) - blk_plug_device(q); - - /* insert the request into the elevator */ - drive_stat_acct(req, 1); - __elv_add_request(q, req, where, 0); + bio_flagged(bio, BIO_CPU_AFFINE)) { + req->cpu = blk_cpu_to_group(get_cpu()); + put_cpu(); + } + + plug = current->plug; + if (plug) { + if (!plug->should_sort && !list_empty(&plug->list)) { + struct request *__rq; + + __rq = list_entry_rq(plug->list.prev); + if (__rq->q != q) + plug->should_sort = 1; + } + /* + * Debug flag, kill later + */ + req->cmd_flags |= REQ_ON_PLUG; + list_add_tail(&req->queuelist, &plug->list); + drive_stat_acct(req, 1); + } else { + spin_lock_irq(q->queue_lock); + add_acct_request(q, req, where); + __blk_run_queue(q, false); +out_unlock: + spin_unlock_irq(q->queue_lock); + } out: - if (unplug || !queue_should_plug(q)) - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); return 0; } @@ -1731,9 +1734,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); - + add_acct_request(q, rq, ELEVATOR_INSERT_BACK); spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -1805,7 +1806,7 @@ static void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && req != &req->q->flush_rq) { + if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -2628,6 +2629,113 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + +#define PLUG_MAGIC 0x91827364 + +void blk_start_plug(struct blk_plug *plug) +{ + struct task_struct *tsk = current; + + plug->magic = PLUG_MAGIC; + INIT_LIST_HEAD(&plug->list); + plug->should_sort = 0; + + /* + * If this is a nested plug, don't actually assign it. It will be + * flushed on its own. + */ + if (!tsk->plug) { + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; + } +} +EXPORT_SYMBOL(blk_start_plug); + +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return !(rqa->q == rqb->q); +} + +static void flush_plug_list(struct blk_plug *plug) +{ + struct request_queue *q; + unsigned long flags; + struct request *rq; + + BUG_ON(plug->magic != PLUG_MAGIC); + + if (list_empty(&plug->list)) + return; + + if (plug->should_sort) + list_sort(NULL, &plug->list, plug_rq_cmp); + + q = NULL; + local_irq_save(flags); + while (!list_empty(&plug->list)) { + rq = list_entry_rq(plug->list.next); + list_del_init(&rq->queuelist); + BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); + BUG_ON(!rq->q); + if (rq->q != q) { + if (q) { + __blk_run_queue(q, false); + spin_unlock(q->queue_lock); + } + q = rq->q; + spin_lock(q->queue_lock); + } + rq->cmd_flags &= ~REQ_ON_PLUG; + + /* + * rq is already accounted, so use raw insert + */ + __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); + } + + if (q) { + __blk_run_queue(q, false); + spin_unlock(q->queue_lock); + } + + BUG_ON(!list_empty(&plug->list)); + local_irq_restore(flags); +} + +static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug) +{ + flush_plug_list(plug); + + if (plug == tsk->plug) + tsk->plug = NULL; +} + +void blk_finish_plug(struct blk_plug *plug) +{ + if (plug) + __blk_finish_plug(current, plug); +} +EXPORT_SYMBOL(blk_finish_plug); + +void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug) +{ + __blk_finish_plug(tsk, plug); + tsk->plug = plug; +} +EXPORT_SYMBOL(__blk_flush_plug); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-exec.c b/block/blk-exec.c index cf1456a..7482b7f 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->end_io = done; WARN_ON(irqs_disabled()); spin_lock_irq(q->queue_lock); - __elv_add_request(q, rq, where, 1); - __generic_unplug_device(q); + __elv_add_request(q, rq, where); + __blk_run_queue(q, false); /* the queue is stopped so it won't be plugged+unplugged */ if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); diff --git a/block/blk-flush.c b/block/blk-flush.c index b27d020..93d5fd8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,6 +1,69 @@ /* * Functions to sequence FLUSH and FUA writes. + * + * Copyright (C) 2011 Max Planck Institute for Gravitational Physics + * Copyright (C) 2011 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request + * properties and hardware capability. + * + * If a request doesn't have data, only REQ_FLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_FLUSH indicates + * that the device cache should be flushed before the data is executed, and + * REQ_FUA means that the data must be on non-volatile media on request + * completion. + * + * If the device doesn't have writeback cache, FLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no + * data or executed as normal requests otherwise. + * + * If the device has writeback cache and supports FUA, REQ_FLUSH is + * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. + * + * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is + * translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * + * The actual execution of flush is double buffered. Whenever a request + * needs to execute PRE or POSTFLUSH, it queues at + * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * flush is issued and the pending_idx is toggled. When the flush + * completes, all the requests which were pending are proceeded to the next + * step. This allows arbitrary merging of different types of FLUSH/FUA + * requests. + * + * Currently, the following conditions are used to determine when to issue + * flush. + * + * C1. At any given time, only one flush shall be in progress. This makes + * double buffering sufficient. + * + * C2. Flush is deferred if any request is executing DATA of its sequence. + * This avoids issuing separate POSTFLUSHes for requests which shared + * PREFLUSH. + * + * C3. The second condition is ignored if there is a request which has + * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid + * starvation in the unlikely case where there are continuous stream of + * FUA (without FLUSH) requests. + * + * For devices which support FUA, it isn't clear whether C2 (and thus C3) + * is beneficial. + * + * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Once while executing DATA and again after the whole sequence is + * complete. The first completion updates the contained bio but doesn't + * finish it so that the bio submitter is notified only after the whole + * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * req_bio_endio(). + * + * The above peculiarity requires that each FLUSH/FUA request has only one + * bio attached to it, which is guaranteed as they aren't allowed to be + * merged in the usual way. */ + #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> @@ -11,58 +74,142 @@ /* FLUSH/FUA sequences */ enum { - QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ - QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ - QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ - QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ - QUEUE_FSEQ_DONE = (1 << 4), + REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ + REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ + REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ + REQ_FSEQ_DONE = (1 << 3), + + REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | + REQ_FSEQ_POSTFLUSH, + + /* + * If flush has been pending longer than the following timeout, + * it's issued even if flush_data requests are still in flight. + */ + FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static struct request *queue_next_fseq(struct request_queue *q); +static bool blk_kick_flush(struct request_queue *q); -unsigned blk_flush_cur_seq(struct request_queue *q) +static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { - if (!q->flush_seq) - return 0; - return 1 << ffz(q->flush_seq); + unsigned int policy = 0; + + if (fflags & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; + } + return policy; } -static struct request *blk_flush_complete_seq(struct request_queue *q, - unsigned seq, int error) +static unsigned int blk_flush_cur_seq(struct request *rq) { - struct request *next_rq = NULL; - - if (error && !q->flush_err) - q->flush_err = error; - - BUG_ON(q->flush_seq & seq); - q->flush_seq |= seq; - - if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { - /* not complete yet, queue the next flush sequence */ - next_rq = queue_next_fseq(q); - } else { - /* complete this flush request */ - __blk_end_request_all(q->orig_flush_rq, q->flush_err); - q->orig_flush_rq = NULL; - q->flush_seq = 0; - - /* dispatch the next flush if there's one */ - if (!list_empty(&q->pending_flushes)) { - next_rq = list_entry_rq(q->pending_flushes.next); - list_move(&next_rq->queuelist, &q->queue_head); - } + return 1 << ffz(rq->flush.seq); +} + +static void blk_flush_restore_request(struct request *rq) +{ + /* + * After flush data completion, @rq->bio is %NULL but we need to + * complete the bio again. @rq->biotail is guaranteed to equal the + * original @rq->bio. Restore it. + */ + rq->bio = rq->biotail; + + /* make @rq a normal request */ + rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->end_io = NULL; +} + +/** + * blk_flush_complete_seq - complete flush sequence + * @rq: FLUSH/FUA request being sequenced + * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) + * @error: whether an error occurred + * + * @rq just completed @seq part of its flush sequence, record the + * completion and trigger the next step. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if requests were added to the dispatch queue, %false otherwise. + */ +static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, + int error) +{ + struct request_queue *q = rq->q; + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + bool queued = false; + + BUG_ON(rq->flush.seq & seq); + rq->flush.seq |= seq; + + if (likely(!error)) + seq = blk_flush_cur_seq(rq); + else + seq = REQ_FSEQ_DONE; + + switch (seq) { + case REQ_FSEQ_PREFLUSH: + case REQ_FSEQ_POSTFLUSH: + /* queue for flush */ + if (list_empty(pending)) + q->flush_pending_since = jiffies; + list_move_tail(&rq->flush.list, pending); + break; + + case REQ_FSEQ_DATA: + list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_add(&rq->queuelist, &q->queue_head); + queued = true; + break; + + case REQ_FSEQ_DONE: + /* + * @rq was previously adjusted by blk_flush_issue() for + * flush sequencing and may already have gone through the + * flush data request completion path. Restore @rq for + * normal completion and end it. + */ + BUG_ON(!list_empty(&rq->queuelist)); + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + __blk_end_request_all(rq, error); + break; + + default: + BUG(); } - return next_rq; + + return blk_kick_flush(q) | queued; } -static void blk_flush_complete_seq_end_io(struct request_queue *q, - unsigned seq, int error) +static void flush_end_io(struct request *flush_rq, int error) { - bool was_empty = elv_queue_empty(q); - struct request *next_rq; + struct request_queue *q = flush_rq->q; + struct list_head *running = &q->flush_queue[q->flush_running_idx]; + bool queued = false; + struct request *rq, *n; - next_rq = blk_flush_complete_seq(q, seq, error); + BUG_ON(q->flush_pending_idx == q->flush_running_idx); + + /* account completion of the flush request */ + q->flush_running_idx ^= 1; + elv_completed_request(q, flush_rq); + + /* and push the waiting requests to the next stage */ + list_for_each_entry_safe(rq, n, running, flush.list) { + unsigned int seq = blk_flush_cur_seq(rq); + + BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + queued |= blk_flush_complete_seq(rq, seq, error); + } /* * Moving a request silently to empty queue_head may stall the @@ -70,127 +217,153 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q, * from request completion path and calling directly into * request_fn may confuse the driver. Always use kblockd. */ - if (was_empty && next_rq) + if (queued) __blk_run_queue(q, true); } -static void pre_flush_end_io(struct request *rq, int error) +/** + * blk_kick_flush - consider issuing flush request + * @q: request_queue being kicked + * + * Flush related states of @q have changed, consider issuing flush request. + * Please read the comment at the top of this file for more info. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if flush was issued, %false otherwise. + */ +static bool blk_kick_flush(struct request_queue *q) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct request *first_rq = + list_first_entry(pending, struct request, flush.list); + + /* C1 described at the top of this file */ + if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + return false; + + /* C2 and C3 */ + if (!list_empty(&q->flush_data_in_flight) && + time_before(jiffies, + q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + return false; + + /* + * Issue flush and toggle pending_idx. This makes pending_idx + * different from running_idx, which means flush is in flight. + */ + blk_rq_init(q, &q->flush_rq); + q->flush_rq.cmd_type = REQ_TYPE_FS; + q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq.rq_disk = first_rq->rq_disk; + q->flush_rq.end_io = flush_end_io; + + q->flush_pending_idx ^= 1; + elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE); + return true; } static void flush_data_end_io(struct request *rq, int error) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); -} + struct request_queue *q = rq->q; -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + __blk_run_queue(q, true); } -static void init_flush_request(struct request *rq, struct gendisk *disk) +/** + * blk_insert_flush - insert a new FLUSH/FUA request + * @rq: request to insert + * + * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions. + * @rq is being submitted. Analyze what needs to be done and put it on the + * right queue. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_insert_flush(struct request *rq) { - rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = WRITE_FLUSH; - rq->rq_disk = disk; -} + struct request_queue *q = rq->q; + unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned int policy = blk_flush_policy(fflags, rq); -static struct request *queue_next_fseq(struct request_queue *q) -{ - struct request *orig_rq = q->orig_flush_rq; - struct request *rq = &q->flush_rq; + BUG_ON(rq->end_io); + BUG_ON(!rq->bio || rq->bio != rq->biotail); - blk_rq_init(q, rq); + /* + * @policy now records what operations need to be done. Adjust + * REQ_FLUSH and FUA for the driver. + */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!(fflags & REQ_FUA)) + rq->cmd_flags &= ~REQ_FUA; - switch (blk_flush_cur_seq(q)) { - case QUEUE_FSEQ_PREFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = pre_flush_end_io; - break; - case QUEUE_FSEQ_DATA: - init_request_from_bio(rq, orig_rq->bio); - /* - * orig_rq->rq_disk may be different from - * bio->bi_bdev->bd_disk if orig_rq got here through - * remapping drivers. Make sure rq->rq_disk points - * to the same one as orig_rq. - */ - rq->rq_disk = orig_rq->rq_disk; - rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); - rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); - rq->end_io = flush_data_end_io; - break; - case QUEUE_FSEQ_POSTFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = post_flush_end_io; - break; - default: - BUG(); + /* + * If there's data but flush is not necessary, the request can be + * processed directly without going through flush machinery. Queue + * for normal execution. + */ + if ((policy & REQ_FSEQ_DATA) && + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + list_add(&rq->queuelist, &q->queue_head); + return; } - elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); - return rq; + /* + * @rq should go through flush machinery. Mark it part of flush + * sequence and submit for further processing. + */ + memset(&rq->flush, 0, sizeof(rq->flush)); + INIT_LIST_HEAD(&rq->flush.list); + rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->end_io = flush_data_end_io; + + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); } -struct request *blk_do_flush(struct request_queue *q, struct request *rq) +/** + * blk_abort_flushes - @q is being aborted, abort flush requests + * @q: request_queue being aborted + * + * To be called from elv_abort_queue(). @q is being aborted. Prepare all + * FLUSH/FUA requests for abortion. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_abort_flushes(struct request_queue *q) { - unsigned int fflags = q->flush_flags; /* may change, cache it */ - bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; - bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); - bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); - unsigned skip = 0; + struct request *rq, *n; + int i; /* - * Special case. If there's data but flush is not necessary, - * the request can be issued directly. - * - * Flush w/o data should be able to be issued directly too but - * currently some drivers assume that rq->bio contains - * non-zero data if it isn't NULL and empty FLUSH requests - * getting here usually have bio's without data. + * Requests in flight for data are already owned by the dispatch + * queue or the device driver. Just restore for normal completion. */ - if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - return rq; + list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); } /* - * Sequenced flushes can't be processed in parallel. If - * another one is already in progress, queue for later - * processing. + * We need to give away requests on flush queues. Restore for + * normal completion and put them on the dispatch queue. */ - if (q->flush_seq) { - list_move_tail(&rq->queuelist, &q->pending_flushes); - return NULL; + for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { + list_for_each_entry_safe(rq, n, &q->flush_queue[i], + flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + list_add_tail(&rq->queuelist, &q->queue_head); + } } - - /* - * Start a new flush sequence - */ - q->flush_err = 0; - q->flush_seq |= QUEUE_FSEQ_STARTED; - - /* adjust FLUSH/FUA of the original request and stash it away */ - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - blk_dequeue_request(rq); - q->orig_flush_rq = rq; - - /* skip unneded sequences and return the first one */ - if (!do_preflush) - skip |= QUEUE_FSEQ_PREFLUSH; - if (!blk_rq_sectors(rq)) - skip |= QUEUE_FSEQ_DATA; - if (!do_postflush) - skip |= QUEUE_FSEQ_POSTFLUSH; - return blk_flush_complete_seq(q, skip, 0); } static void bio_end_flush(struct bio *bio, int err) diff --git a/block/blk-lib.c b/block/blk-lib.c index bd3e8df..25de73e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -136,8 +136,6 @@ static void bio_batch_end_io(struct bio *bio, int err) * * Description: * Generate and issue number of bios with zerofiled pages. - * Send barrier at the beginning and at the end if requested. This guarantie - * correct request ordering. Empty barrier allow us to avoid post queue flush. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, diff --git a/block/blk-merge.c b/block/blk-merge.c index ea85e20..cfcc37c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) return 0; } + +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return attempt_merge(q, rq, next); +} diff --git a/block/blk-settings.c b/block/blk-settings.c index 36c8c1f..1fa7692 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -164,25 +164,10 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; - q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ - if (q->unplug_delay == 0) - q->unplug_delay = 1; - - q->unplug_timer.function = blk_unplug_timeout; - q->unplug_timer.data = (unsigned long)q; - blk_set_default_limits(&q->limits); blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); /* - * If the caller didn't supply a lock, fall back to our embedded - * per-queue locks - */ - if (!q->queue_lock) - q->queue_lock = &q->__queue_lock; - - /* * by default assume old behaviour and bounce for any highmem page */ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 41fb691..261c75c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); - blk_throtl_exit(q); - if (rl->rq_pool) mempool_destroy(rl->rq_pool); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e36cc10..5352bda 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -102,7 +102,7 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; - atomic_t limits_changed; + bool limits_changed; }; enum tg_state_flags { @@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); + td->limits_changed = false; /* * Take the initial reference that will be released on destroy @@ -737,34 +738,36 @@ static void throtl_process_limit_change(struct throtl_data *td) struct throtl_grp *tg; struct hlist_node *pos, *n; - if (!atomic_read(&td->limits_changed)) + if (!td->limits_changed) return; - throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + xchg(&td->limits_changed, false); - /* - * Make sure updates from throtl_update_blkio_group_read_bps() group - * of functions to tg->limits_changed are visible. We do not - * want update td->limits_changed to be visible but update to - * tg->limits_changed not being visible yet on this cpu. Hence - * the read barrier. - */ - smp_rmb(); + throtl_log(td, "limits changed"); hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - if (throtl_tg_on_rr(tg) && tg->limits_changed) { - throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" - " riops=%u wiops=%u", tg->bps[READ], - tg->bps[WRITE], tg->iops[READ], - tg->iops[WRITE]); + if (!tg->limits_changed) + continue; + + if (!xchg(&tg->limits_changed, false)) + continue; + + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], + tg->iops[READ], tg->iops[WRITE]); + + /* + * Restart the slices for both READ and WRITES. It + * might happen that a group's limit are dropped + * suddenly and we don't want to account recently + * dispatched IO with new low rate + */ + throtl_start_new_slice(td, tg, 0); + throtl_start_new_slice(td, tg, 1); + + if (throtl_tg_on_rr(tg)) tg_update_disptime(td, tg); - tg->limits_changed = false; - } } - - smp_mb__before_atomic_dec(); - atomic_dec(&td->limits_changed); - smp_mb__after_atomic_dec(); } /* Dispatch throttled bios. Should be called without queue lock held. */ @@ -774,6 +777,7 @@ static int throtl_dispatch(struct request_queue *q) unsigned int nr_disp = 0; struct bio_list bio_list_on_stack; struct bio *bio; + struct blk_plug plug; spin_lock_irq(q->queue_lock); @@ -802,9 +806,10 @@ out: * immediate dispatch */ if (nr_disp) { + blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); - blk_unplug(q); + blk_finish_plug(&plug); } return nr_disp; } @@ -825,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) struct delayed_work *dwork = &td->throtl_work; - if (total_nr_queued(td) > 0) { + /* schedule work if limits changed even if no bio is queued */ + if (total_nr_queued(td) > 0 || td->limits_changed) { /* * We might have a work scheduled to be executed in future. * Cancel that and schedule a new one. @@ -898,6 +904,15 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) spin_unlock_irqrestore(td->queue->queue_lock, flags); } +static void throtl_update_blkio_group_common(struct throtl_data *td, + struct throtl_grp *tg) +{ + xchg(&tg->limits_changed, true); + xchg(&td->limits_changed, true); + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td, 0); +} + /* * For all update functions, key should be a valid pointer because these * update functions are called under blkcg_lock, that means, blkg is @@ -911,64 +926,43 @@ static void throtl_update_blkio_group_read_bps(void *key, struct blkio_group *blkg, u64 read_bps) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->bps[READ] = read_bps; - /* Make sure read_bps is updated before setting limits_changed */ - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - - /* Make sure tg->limits_changed is updated before td->limits_changed */ - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - - /* Schedule a work now to process the limit change */ - throtl_schedule_delayed_work(td, 0); + tg->bps[READ] = read_bps; + throtl_update_blkio_group_common(td, tg); } static void throtl_update_blkio_group_write_bps(void *key, struct blkio_group *blkg, u64 write_bps) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->bps[WRITE] = write_bps; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + tg->bps[WRITE] = write_bps; + throtl_update_blkio_group_common(td, tg); } static void throtl_update_blkio_group_read_iops(void *key, struct blkio_group *blkg, unsigned int read_iops) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->iops[READ] = read_iops; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + tg->iops[READ] = read_iops; + throtl_update_blkio_group_common(td, tg); } static void throtl_update_blkio_group_write_iops(void *key, struct blkio_group *blkg, unsigned int write_iops) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->iops[WRITE] = write_iops; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + tg->iops[WRITE] = write_iops; + throtl_update_blkio_group_common(td, tg); } -void throtl_shutdown_timer_wq(struct request_queue *q) +static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1009,20 +1003,28 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* * There is already another bio queued in same dir. No * need to update dispatch time. - * Still update the disptime if rate limits on this group - * were changed. */ - if (!tg->limits_changed) - update_disptime = false; - else - tg->limits_changed = false; - + update_disptime = false; goto queue_bio; + } /* Bio is with-in rate limit of group */ if (tg_may_dispatch(td, tg, bio, NULL)) { throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being queued + * otherwise it might happen that a bio is not queued for + * a long time and slice keeps on extending and trim is not + * called for a long time. Now if limits are reduced suddenly + * we take into account all the IO dispatched so far at new + * low rate and * newly queued IO gets a really long dispatch + * time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(td, tg, rw); goto out; } @@ -1058,7 +1060,7 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; - atomic_set(&td->limits_changed, 0); + td->limits_changed = false; /* Init root group */ tg = &td->root_tg; @@ -1070,6 +1072,7 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; tg->iops[0] = tg->iops[1] = -1; + td->limits_changed = false; /* * Set root group reference to 2. One reference will be dropped when @@ -1102,7 +1105,7 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!td); - throtl_shutdown_timer_wq(q); + throtl_shutdown_wq(q); spin_lock_irq(q->queue_lock); throtl_release_tgs(td); @@ -1132,7 +1135,7 @@ void blk_throtl_exit(struct request_queue *q) * update limits through cgroup and another work got queued, cancel * it. */ - throtl_shutdown_timer_wq(q); + throtl_shutdown_wq(q); throtl_td_free(td); } diff --git a/block/blk.h b/block/blk.h index 2db8f32..c8db371 100644 --- a/block/blk.h +++ b/block/blk.h @@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); -void blk_unplug_work(struct work_struct *work); -void blk_unplug_timeout(unsigned long data); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); @@ -51,21 +49,17 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -struct request *blk_do_flush(struct request_queue *q, struct request *rq); +void blk_insert_flush(struct request *rq); +void blk_abort_flushes(struct request_queue *q); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; while (1) { - while (!list_empty(&q->queue_head)) { + if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || - rq == &q->flush_rq) - return rq; - rq = blk_do_flush(q, rq); - if (rq) - return rq; + return rq; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) @@ -109,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea83a4f..7785169 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4; #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) #define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) + ((struct cfq_io_context *) (rq)->elevator_private[0]) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -146,7 +146,6 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; - struct cfq_group *orig_cfqg; /* Number of sectors dispatched from queue in single dispatch round */ unsigned long nr_sectors; }; @@ -179,6 +178,8 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; unsigned int weight; + unsigned int new_weight; + bool needs_update; /* number of cfqq currently on this group */ int nr_cfqq; @@ -238,6 +239,7 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; + unsigned int busy_sync_queues; int rq_in_driver; int rq_in_flight[2]; @@ -285,7 +287,6 @@ struct cfq_data { unsigned int cfq_slice_idle; unsigned int cfq_group_idle; unsigned int cfq_latency; - unsigned int cfq_group_isolation; unsigned int cic_index; struct list_head cic_list; @@ -501,13 +502,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) } } -static int cfq_queue_empty(struct request_queue *q) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - return !cfqd->rq_queued; -} - /* * Scale schedule slice based on io priority. Use the sync time slice only * if a queue is marked sync and has sync io queued. A sync queue with async @@ -558,15 +552,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) static void update_min_vdisktime(struct cfq_rb_root *st) { - u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; if (st->left) { cfqg = rb_entry_cfqg(st->left); - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + st->min_vdisktime = max_vdisktime(st->min_vdisktime, + cfqg->vdisktime); } - - st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); } /* @@ -863,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) } static void -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +cfq_update_group_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + if (cfqg->needs_update) { + cfqg->weight = cfqg->new_weight; + cfqg->needs_update = false; + } +} + +static void +cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + + cfq_update_group_weight(cfqg); + __cfq_group_service_tree_add(st, cfqg); + st->total_weight += cfqg->weight; +} + +static void +cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; struct cfq_group *__cfqg; @@ -884,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; } else cfqg->vdisktime = st->min_vdisktime; + cfq_group_service_tree_add(st, cfqg); +} - __cfq_group_service_tree_add(st, cfqg); - st->total_weight += cfqg->weight; +static void +cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + st->total_weight -= cfqg->weight; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); } static void -cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; @@ -902,14 +920,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - st->total_weight -= cfqg->weight; - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - cfq_rb_erase(&cfqg->rb_node, st); + cfq_group_service_tree_del(st, cfqg); cfqg->saved_workload_slice = 0; cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } -static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, + unsigned int *unaccounted_time) { unsigned int slice_used; @@ -928,8 +945,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 1); } else { slice_used = jiffies - cfqq->slice_start; - if (slice_used > cfqq->allocated_slice) + if (slice_used > cfqq->allocated_slice) { + *unaccounted_time = slice_used - cfqq->allocated_slice; slice_used = cfqq->allocated_slice; + } + if (time_after(cfqq->slice_start, cfqq->dispatch_start)) + *unaccounted_time += cfqq->slice_start - + cfqq->dispatch_start; } return slice_used; @@ -939,12 +961,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl, charge; + unsigned int used_sl, charge, unaccounted_sl = 0; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; BUG_ON(nr_sync < 0); - used_sl = charge = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); if (iops_mode(cfqd)) charge = cfqq->slice_dispatch; @@ -952,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, charge = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ - cfq_rb_erase(&cfqg->rb_node, st); + cfq_group_service_tree_del(st, cfqg); cfqg->vdisktime += cfq_scale_slice(charge, cfqg); - __cfq_group_service_tree_add(st, cfqg); + /* If a new weight was requested, update now, off tree */ + cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ if (time_after(cfqd->workload_expires, jiffies)) { @@ -970,7 +993,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" " sect=%u", used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd), cfqq->nr_sectors); - cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, + unaccounted_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } @@ -985,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, unsigned int weight) { - cfqg_of_blkg(blkg)->weight = weight; + struct cfq_group *cfqg = cfqg_of_blkg(blkg); + cfqg->new_weight = weight; + cfqg->needs_update = true; } static struct cfq_group * @@ -1187,32 +1213,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, int new_cfqq = 1; int group_changed = 0; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD - && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { - /* Move this cfq to root group */ - cfq_log_cfqq(cfqd, cfqq, "moving to root group"); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfqq->orig_cfqg = cfqq->cfqg; - cfqq->cfqg = &cfqd->root_group; - cfqd->root_group.ref++; - group_changed = 1; - } else if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { - /* cfqq is sequential now needs to go to its original group */ - BUG_ON(cfqq->cfqg != &cfqd->root_group); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfq_put_cfqg(cfqq->cfqg); - cfqq->cfqg = cfqq->orig_cfqg; - cfqq->orig_cfqg = NULL; - group_changed = 1; - cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); - } -#endif - service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { @@ -1284,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, service_tree->count++; if ((add_front || !new_cfqq) && !group_changed) return; - cfq_group_service_tree_add(cfqd, cfqq->cfqg); + cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -1372,6 +1372,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + if (cfq_cfqq_sync(cfqq)) + cfqd->busy_sync_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -1395,9 +1397,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } - cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_group_notify_queue_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_cfqq_sync(cfqq)) + cfqd->busy_sync_queues--; } /* @@ -2405,6 +2409,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Does this cfqq already have too much IO in flight? */ if (cfqq->dispatched >= max_dispatch) { + bool promote_sync = false; /* * idle queue must always only have a single IO in flight */ @@ -2412,15 +2417,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* + * If there is only one sync queue + * we can ignore async queue here and give the sync + * queue no dispatch limit. The reason is a sync queue can + * preempt async queue, limiting the sync queue doesn't make + * sense. This is useful for aiostress test. + */ + if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) + promote_sync = true; + + /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && + !promote_sync) return false; /* * Sole queue user, no limit */ - if (cfqd->busy_queues == 1) + if (cfqd->busy_queues == 1 || promote_sync) max_dispatch = -1; else /* @@ -2542,7 +2558,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg, *orig_cfqg; + struct cfq_group *cfqg; BUG_ON(cfqq->ref <= 0); @@ -2554,7 +2570,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); cfqg = cfqq->cfqg; - orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2564,8 +2579,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); - if (orig_cfqg) - cfq_put_cfqg(orig_cfqg); } /* @@ -3613,12 +3626,12 @@ static void cfq_put_request(struct request *rq) put_io_context(RQ_CIC(rq)->ioc); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; + rq->elevator_private[0] = NULL; + rq->elevator_private[1] = NULL; /* Put down rq reference on cfqg */ cfq_put_cfqg(RQ_CFQG(rq)); - rq->elevator_private3 = NULL; + rq->elevator_private[2] = NULL; cfq_put_queue(cfqq); } @@ -3705,13 +3718,12 @@ new_queue: } cfqq->allocated[rw]++; - cfqq->ref++; - rq->elevator_private = cic; - rq->elevator_private2 = cfqq; - rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); + cfqq->ref++; + rq->elevator_private[0] = cic; + rq->elevator_private[1] = cfqq; + rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); spin_unlock_irqrestore(q->queue_lock, flags); - return 0; queue_fail: @@ -3953,7 +3965,6 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; - cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; /* * we optimistically start assuming sync ops weren't delayed in last @@ -4029,7 +4040,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); -SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -4063,7 +4073,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); -STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -4081,7 +4090,6 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_idle), CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), - CFQ_ATTR(group_isolation), __ATTR_NULL }; @@ -4096,7 +4104,6 @@ static struct elevator_type iosched_cfq = { .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, .elevator_deactivate_req_fn = cfq_deactivate_request, - .elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, diff --git a/block/cfq.h b/block/cfq.h index 54a6d90..2a15592 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, } static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) + unsigned long time, unsigned long unaccounted_time) { - blkiocg_update_timeslice_used(blkg, time); + blkiocg_update_timeslice_used(blkg, time, unaccounted_time); } static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) @@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue) {} static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) {} + unsigned long time, unsigned long unaccounted_time) {} static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, bool direction, bool sync) {} diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbc..5139c0e 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -326,14 +326,6 @@ dispatch_request: return 1; } -static int deadline_queue_empty(struct request_queue *q) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - return list_empty(&dd->fifo_list[WRITE]) - && list_empty(&dd->fifo_list[READ]); -} - static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = { .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, - .elevator_queue_empty_fn = deadline_queue_empty, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_init_fn = deadline_init_queue, diff --git a/block/elevator.c b/block/elevator.c index 236e93c..c387d31 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_rq_merge_ok); -static inline int elv_try_merge(struct request *__rq, struct bio *bio) +int elv_try_merge(struct request *__rq, struct bio *bio) { int ret = ELEVATOR_NO_MERGE; @@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) struct list_head *entry; int stop_flags; + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + if (q->last_merge == rq) q->last_merge = NULL; @@ -519,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_NO_MERGE; } +/* + * Attempt to do an insertion back merge. Only check for the case where + * we can append 'rq' to an existing request, so we can throw 'rq' away + * afterwards. + * + * Returns true if we merged, false otherwise + */ +static bool elv_attempt_insert_merge(struct request_queue *q, + struct request *rq) +{ + struct request *__rq; + + if (blk_queue_nomerges(q)) + return false; + + /* + * First try one-hit cache. + */ + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + return true; + + if (blk_queue_noxmerges(q)) + return false; + + /* + * See if our hash lookup can find a potential backmerge. + */ + __rq = elv_rqhash_find(q, blk_rq_pos(rq)); + if (__rq && blk_attempt_req_merge(q, __rq, rq)) + return true; + + return false; +} + void elv_merged_request(struct request_queue *q, struct request *rq, int type) { struct elevator_queue *e = q->elevator; @@ -536,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; + const int next_sorted = next->cmd_flags & REQ_SORTED; - if (e->ops->elevator_merge_req_fn) + if (next_sorted && e->ops->elevator_merge_req_fn) e->ops->elevator_merge_req_fn(q, rq, next); elv_rqhash_reposition(q, rq); - elv_rqhash_del(q, next); - q->nr_sorted--; + if (next_sorted) { + elv_rqhash_del(q, next); + q->nr_sorted--; + } + q->last_merge = rq; } @@ -617,21 +657,12 @@ void elv_quiesce_end(struct request_queue *q) void elv_insert(struct request_queue *q, struct request *rq, int where) { - int unplug_it = 1; - trace_block_rq_insert(q, rq); rq->q = q; switch (where) { case ELEVATOR_INSERT_REQUEUE: - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - * Clear unplug_it and fall through. - */ - unplug_it = 0; - case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); @@ -654,6 +685,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) __blk_run_queue(q, false); break; + case ELEVATOR_INSERT_SORT_MERGE: + /* + * If we succeed in merging this request with one in the + * queue already, we are done - rq has now been freed, + * so no need to do anything further. + */ + if (elv_attempt_insert_merge(q, rq)) + break; case ELEVATOR_INSERT_SORT: BUG_ON(rq->cmd_type != REQ_TYPE_FS && !(rq->cmd_flags & REQ_DISCARD)); @@ -673,24 +712,21 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->elevator->ops->elevator_add_req_fn(q, rq); break; + case ELEVATOR_INSERT_FLUSH: + rq->cmd_flags |= REQ_SOFTBARRIER; + blk_insert_flush(rq); + break; default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); BUG(); } - - if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - queue_in_flight(q); - - if (nrq >= q->unplug_thresh) - __generic_unplug_device(q); - } } -void __elv_add_request(struct request_queue *q, struct request *rq, int where, - int plug) +void __elv_add_request(struct request_queue *q, struct request *rq, int where) { + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || @@ -702,38 +738,20 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, where == ELEVATOR_INSERT_SORT) where = ELEVATOR_INSERT_BACK; - if (plug) - blk_plug_device(q); - elv_insert(q, rq, where); } EXPORT_SYMBOL(__elv_add_request); -void elv_add_request(struct request_queue *q, struct request *rq, int where, - int plug) +void elv_add_request(struct request_queue *q, struct request *rq, int where) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __elv_add_request(q, rq, where, plug); + __elv_add_request(q, rq, where); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(elv_add_request); -int elv_queue_empty(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - - if (!list_empty(&q->queue_head)) - return 0; - - if (e->ops->elevator_queue_empty_fn) - return e->ops->elevator_queue_empty_fn(q); - - return 1; -} -EXPORT_SYMBOL(elv_queue_empty); - struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; @@ -759,7 +777,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (e->ops->elevator_set_req_fn) return e->ops->elevator_set_req_fn(q, rq, gfp_mask); - rq->elevator_private = NULL; + rq->elevator_private[0] = NULL; return 0; } @@ -785,6 +803,8 @@ void elv_abort_queue(struct request_queue *q) { struct request *rq; + blk_abort_flushes(q); + while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; diff --git a/block/genhd.c b/block/genhd.c index cbf1112..c91a2da 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%u %lu %lu %llu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[0]), - part_stat_read(hd, merges[0]), - (unsigned long long)part_stat_read(hd, sectors[0]), - jiffies_to_msecs(part_stat_read(hd, ticks[0])), - part_stat_read(hd, ios[1]), - part_stat_read(hd, merges[1]), - (unsigned long long)part_stat_read(hd, sectors[1]), - jiffies_to_msecs(part_stat_read(hd, ticks[1])), + part_stat_read(hd, ios[READ]), + part_stat_read(hd, merges[READ]), + (unsigned long long)part_stat_read(hd, sectors[READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[READ])), + part_stat_read(hd, ios[WRITE]), + part_stat_read(hd, merges[WRITE]), + (unsigned long long)part_stat_read(hd, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) @@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk) void disk_unblock_events(struct gendisk *disk) { if (disk->ev) - __disk_unblock_events(disk, true); + __disk_unblock_events(disk, false); } /** diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 232c4b3..06389e9 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq) list_add_tail(&rq->queuelist, &nd->queue); } -static int noop_queue_empty(struct request_queue *q) -{ - struct noop_data *nd = q->elevator->elevator_data; - - return list_empty(&nd->queue); -} - static struct request * noop_former_request(struct request_queue *q, struct request *rq) { @@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, - .elevator_queue_empty_fn = noop_queue_empty, .elevator_former_req_fn = noop_former_request, .elevator_latter_req_fn = noop_latter_request, .elevator_init_fn = noop_init_queue, |