diff options
-rw-r--r-- | Documentation/block/queue-sysfs.txt | 10 | ||||
-rw-r--r-- | block/blk-core.c | 11 | ||||
-rw-r--r-- | block/blk-ioc.c | 40 | ||||
-rw-r--r-- | block/blk-lib.c | 5 | ||||
-rw-r--r-- | block/blk-softirq.c | 11 | ||||
-rw-r--r-- | block/blk-sysfs.c | 13 | ||||
-rw-r--r-- | block/blk-throttle.c | 8 | ||||
-rw-r--r-- | block/cfq-iosched.c | 152 | ||||
-rw-r--r-- | block/compat_ioctl.c | 14 | ||||
-rw-r--r-- | block/deadline-iosched.c | 4 | ||||
-rw-r--r-- | block/elevator.c | 7 | ||||
-rw-r--r-- | block/genhd.c | 28 | ||||
-rw-r--r-- | fs/block_dev.c | 23 | ||||
-rw-r--r-- | fs/compat_ioctl.c | 5 | ||||
-rw-r--r-- | fs/partitions/check.c | 12 | ||||
-rw-r--r-- | fs/reiserfs/journal.c | 13 | ||||
-rw-r--r-- | fs/super.c | 4 | ||||
-rw-r--r-- | include/linux/blkdev.h | 27 | ||||
-rw-r--r-- | include/linux/elevator.h | 2 | ||||
-rw-r--r-- | include/linux/fd.h | 22 | ||||
-rw-r--r-- | include/linux/fs.h | 4 | ||||
-rw-r--r-- | include/linux/genhd.h | 2 | ||||
-rw-r--r-- | include/linux/init_task.h | 1 | ||||
-rw-r--r-- | include/linux/iocontext.h | 14 | ||||
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | kernel/exit.c | 1 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | mm/backing-dev.c | 2 |
28 files changed, 229 insertions, 208 deletions
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index f652740..d8147b3 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -45,9 +45,13 @@ device. rq_affinity (RW) ---------------- -If this option is enabled, the block layer will migrate request completions -to the CPU that originally submitted the request. For some workloads -this provides a significant reduction in CPU cycles due to caching effects. +If this option is '1', the block layer will migrate request completions to the +cpu "group" that originally submitted the request. For some workloads this +provides a significant reduction in CPU cycles due to caching effects. + +For storage configurations that need to maximize distribution of completion +processing setting this option to '2' forces the completion to run on the +requesting cpu (bypassing the "group" aggregation logic). scheduler (RW) -------------- diff --git a/block/blk-core.c b/block/blk-core.c index 1d49e1c..f8cb099 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1282,10 +1282,8 @@ get_rq: init_request_from_bio(req, bio); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) { - req->cpu = blk_cpu_to_group(get_cpu()); - put_cpu(); - } + bio_flagged(bio, BIO_CPU_AFFINE)) + req->cpu = smp_processor_id(); plug = current->plug; if (plug) { @@ -1305,7 +1303,10 @@ get_rq: plug->should_sort = 1; } list_add_tail(&req->queuelist, &plug->list); + plug->count++; drive_stat_acct(req, 1); + if (plug->count >= BLK_MAX_REQUEST_COUNT) + blk_flush_plug_list(plug, false); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2629,6 +2630,7 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; + plug->count = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2712,6 +2714,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; list_splice_init(&plug->list, &list); + plug->count = 0; if (plug->should_sort) { list_sort(NULL, &list, plug_rq_cmp); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 342eae9..6f9bbd9 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -82,26 +82,26 @@ void exit_io_context(struct task_struct *task) struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { - struct io_context *ret; + struct io_context *ioc; - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_long_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = 0; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->cic_list); - ret->ioc_data = NULL; + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + if (ioc) { + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + spin_lock_init(&ioc->lock); + ioc->ioprio_changed = 0; + ioc->ioprio = 0; + ioc->last_waited = 0; /* doesn't matter... */ + ioc->nr_batch_requests = 0; /* because this is 0 */ + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->cic_list); + ioc->ioc_data = NULL; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) - ret->cgroup_changed = 0; + ioc->cgroup_changed = 0; #endif } - return ret; + return ioc; } /* @@ -139,19 +139,19 @@ struct io_context *current_io_context(gfp_t gfp_flags, int node) */ struct io_context *get_io_context(gfp_t gfp_flags, int node) { - struct io_context *ret = NULL; + struct io_context *ioc = NULL; /* * Check for unlikely race with exiting task. ioc ref count is * zero when ioc is being detached. */ do { - ret = current_io_context(gfp_flags, node); - if (unlikely(!ret)) + ioc = current_io_context(gfp_flags, node); + if (unlikely(!ioc)) break; - } while (!atomic_long_inc_not_zero(&ret->refcount)); + } while (!atomic_long_inc_not_zero(&ioc->refcount)); - return ret; + return ioc; } EXPORT_SYMBOL(get_io_context); diff --git a/block/blk-lib.c b/block/blk-lib.c index 78e627e..2b461b4 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -59,7 +59,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * granularity */ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - if (q->limits.discard_granularity) { + if (unlikely(!max_discard_sectors)) { + /* Avoid infinite loop below. Being cautious never hurts. */ + return -EOPNOTSUPP; + } else if (q->limits.discard_granularity) { unsigned int disc_sects = q->limits.discard_granularity >> 9; max_discard_sectors &= ~(disc_sects - 1); diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ee9c216..475fab8 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; - int ccpu, cpu, group_cpu; BUG_ON(!q->softirq_done_fn); local_irq_save(flags); cpu = smp_processor_id(); - group_cpu = blk_cpu_to_group(cpu); /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) { ccpu = req->cpu; - else + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } + } else ccpu = cpu; if (ccpu == cpu || ccpu == group_cpu) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d935bd8..0ee17b5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); - return queue_var_show(set, page); + return queue_var_show(set << force, page); } static ssize_t @@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) ret = queue_var_store(&val, page, count); spin_lock_irq(q->queue_lock); - if (val) + if (val) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - else - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + if (val == 2) + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else { + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } spin_unlock_irq(q->queue_lock); #endif return ret; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3689f83..f6a7941 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -142,9 +142,9 @@ static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) return NULL; } -static inline int total_nr_queued(struct throtl_data *td) +static inline unsigned int total_nr_queued(struct throtl_data *td) { - return (td->nr_queued[0] + td->nr_queued[1]); + return td->nr_queued[0] + td->nr_queued[1]; } static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) @@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q) bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", + throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", total_nr_queued(td), td->nr_queued[READ], td->nr_queued[WRITE]); @@ -970,7 +970,7 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) struct delayed_work *dwork = &td->throtl_work; /* schedule work if limits changed even if no bio is queued */ - if (total_nr_queued(td) > 0 || td->limits_changed) { + if (total_nr_queued(td) || td->limits_changed) { /* * We might have a work scheduled to be executed in future. * Cancel that and schedule a new one. diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ae21919..1f96ad6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,9 +87,10 @@ struct cfq_rb_root { unsigned count; unsigned total_weight; u64 min_vdisktime; + struct cfq_ttime ttime; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ - .count = 0, .min_vdisktime = 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ + .ttime = {.last_end_request = jiffies,},} /* * Per process-grouping structure @@ -129,14 +130,12 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; - /* pending metadata requests */ - int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; /* io prio of this group */ unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short ioprio_class; pid_t pid; @@ -212,6 +211,7 @@ struct cfq_group { #endif /* number of requests that are on the dispatch list or inside driver */ int dispatched; + struct cfq_ttime ttime; }; /* @@ -393,6 +393,18 @@ CFQ_CFQQ_FNS(wait_busy); j++, st = i < IDLE_WORKLOAD ? \ &cfqg->service_trees[i][j]: NULL) \ +static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, + struct cfq_ttime *ttime, bool group_idle) +{ + unsigned long slice; + if (!sample_valid(ttime->ttime_samples)) + return false; + if (group_idle) + slice = cfqd->cfq_group_idle; + else + slice = cfqd->cfq_slice_idle; + return ttime->ttime_mean > slice; +} static inline bool iops_mode(struct cfq_data *cfqd) { @@ -670,9 +682,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq_is_sync(rq1) != rq_is_sync(rq2)) return rq_is_sync(rq1) ? rq1 : rq2; - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) - return rq1->cmd_flags & REQ_META ? rq1 : rq2; - s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -1005,8 +1014,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, - unsigned int weight) +static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { struct cfq_group *cfqg = cfqg_of_blkg(blkg); cfqg->new_weight = weight; @@ -1059,6 +1068,8 @@ static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + cfqg->ttime.last_end_request = jiffies; + /* * Take the initial reference that will be released on destroy * This can be thought of a joint reference by cgroup and @@ -1235,7 +1246,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd) * it should not be NULL as even if elevator was exiting, cgroup deltion * path got to it first. */ -void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) { unsigned long flags; struct cfq_data *cfqd = key; @@ -1502,16 +1513,11 @@ static void cfq_add_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - struct request *__alias, *prev; + struct request *prev; cfqq->queued[rq_is_sync(rq)]++; - /* - * looks a little odd, but the first insert might return an alias. - * if that happens, put the alias on the dispatch list - */ - while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) - cfq_dispatch_insert(cfqd->queue, __alias); + elv_rb_add(&cfqq->sort_list, rq); if (!cfq_cfqq_on_rr(cfqq)) cfq_add_cfqq_rr(cfqd, cfqq); @@ -1598,10 +1604,6 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); - if (rq->cmd_flags & REQ_META) { - WARN_ON(!cfqq->meta_pending); - cfqq->meta_pending--; - } } static int cfq_merge(struct request_queue *q, struct request **req, @@ -1969,7 +1971,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && + !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) return true; cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", service_tree->count); @@ -2022,10 +2025,10 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * slice, then don't idle. This avoids overrunning the allotted * time slice. */ - if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) { + if (sample_valid(cic->ttime.ttime_samples) && + (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) { cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", - cic->ttime_mean); + cic->ttime.ttime_mean); return; } @@ -2381,8 +2384,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * this group, wait for requests to complete. */ check_group_idle: - if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 - && cfqq->cfqg->dispatched) { + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && + cfqq->cfqg->dispatched && + !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { cfqq = NULL; goto keep_queue; } @@ -2833,7 +2837,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, cfqd->queue->node); if (cic) { - cic->last_end_request = jiffies; + cic->ttime.last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; @@ -2883,7 +2887,6 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) * elevate the priority of this queue */ cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; cfq_clear_cfqq_prio_changed(cfqq); } @@ -3221,14 +3224,28 @@ err: } static void -cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) +__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) { - unsigned long elapsed = jiffies - cic->last_end_request; - unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); + unsigned long elapsed = jiffies - ttime->last_end_request; + elapsed = min(elapsed, 2UL * slice_idle); - cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; - cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; - cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; + ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8; + ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples; +} + +static void +cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_io_context *cic) +{ + if (cfq_cfqq_sync(cfqq)) { + __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); + __cfq_update_io_thinktime(&cfqq->service_tree->ttime, + cfqd->cfq_slice_idle); + } +#ifdef CONFIG_CFQ_GROUP_IOSCHED + __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); +#endif } static void @@ -3277,8 +3294,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; - else if (sample_valid(cic->ttime_samples)) { - if (cic->ttime_mean > cfqd->cfq_slice_idle) + else if (sample_valid(cic->ttime.ttime_samples)) { + if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -3340,13 +3357,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, return true; /* - * So both queues are sync. Let the new request get disk time if - * it's a metadata request and the current queue is doing regular IO. - */ - if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) - return true; - - /* * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. */ if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) @@ -3410,10 +3420,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq->cmd_flags & REQ_META) - cfqq->meta_pending++; - cfq_update_io_thinktime(cfqd, cic); + cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); cfq_update_idle_window(cfqd, cfqq, cic); @@ -3520,12 +3528,16 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (cfqq->cfqg->nr_cfqq > 1) return false; + /* the only queue in the group, but think time is big */ + if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) + return false; + if (cfq_slice_used(cfqq)) return true; /* if slice left is less than think time, wait busy */ - if (cic && sample_valid(cic->ttime_samples) - && (cfqq->slice_end - jiffies < cic->ttime_mean)) + if (cic && sample_valid(cic->ttime.ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) return true; /* @@ -3566,11 +3578,24 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { - RQ_CIC(rq)->last_end_request = now; + struct cfq_rb_root *service_tree; + + RQ_CIC(rq)->ttime.last_end_request = now; + + if (cfq_cfqq_on_rr(cfqq)) + service_tree = cfqq->service_tree; + else + service_tree = service_tree_for(cfqq->cfqg, + cfqq_prio(cfqq), cfqq_type(cfqq)); + service_tree->ttime.last_end_request = now; if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) cfqd->last_delayed_sync = now; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED + cfqq->cfqg->ttime.last_end_request = now; +#endif + /* * If this is the active queue, check if it needs to be expired, * or if we want to idle in case it has no pending requests. @@ -3616,30 +3641,6 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -/* - * we temporarily boost lower priority queues if they are holding fs exclusive - * resources. they are boosted to normal prio (CLASS_BE/4) - */ -static void cfq_prio_boost(struct cfq_queue *cfqq) -{ - if (has_fs_excl()) { - /* - * boost idle prio on transactions that would lock out other - * users of the filesystem - */ - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; - } else { - /* - * unboost the queue (if needed) - */ - cfqq->ioprio_class = cfqq->org_ioprio_class; - cfqq->ioprio = cfqq->org_ioprio; - } -} - static inline int __cfq_may_queue(struct cfq_queue *cfqq) { if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { @@ -3670,7 +3671,6 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { cfq_init_prio_data(cfqq, cic->ioc); - cfq_prio_boost(cfqq); return __cfq_may_queue(cfqq); } diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index cc3eb78..7b72502 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -208,19 +208,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode, #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) -struct compat_floppy_struct { - compat_uint_t size; - compat_uint_t sect; - compat_uint_t head; - compat_uint_t track; - compat_uint_t stretch; - unsigned char gap; - unsigned char rate; - unsigned char spec1; - unsigned char fmt_gap; - const compat_caddr_t name; -}; - struct compat_floppy_drive_params { char cmos; compat_ulong_t max_dtr; @@ -288,7 +275,6 @@ struct compat_floppy_write_errors { #define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct) #define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct) -#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) #define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params) #define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params) #define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct) diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 5139c0e..c644137 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -77,10 +77,8 @@ static void deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { struct rb_root *root = deadline_rb_root(dd, rq); - struct request *__alias; - while (unlikely(__alias = elv_rb_add(root, rq))) - deadline_move_request(dd, __alias); + elv_rb_add(root, rq); } static inline void diff --git a/block/elevator.c b/block/elevator.c index b0b38ce..a3b64bc 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -353,7 +353,7 @@ static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) * RB-tree support functions for inserting/lookup/removal of requests * in a sorted RB tree. */ -struct request *elv_rb_add(struct rb_root *root, struct request *rq) +void elv_rb_add(struct rb_root *root, struct request *rq) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -365,15 +365,12 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq) if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; - else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) + else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) p = &(*p)->rb_right; - else - return __rq; } rb_link_node(&rq->rb_node, parent, p); rb_insert_color(&rq->rb_node, root); - return NULL; } EXPORT_SYMBOL(elv_rb_add); diff --git a/block/genhd.c b/block/genhd.c index 6024b82..5cb51c5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -602,7 +602,7 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); - /* Register BDI before referencing it from bdev */ + /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, disk_devt(disk)); @@ -1140,7 +1140,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "wsect wuse running use aveq" "\n\n"); */ - + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); @@ -1164,7 +1164,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) ); } disk_part_iter_exit(&piter); - + return 0; } @@ -1492,30 +1492,32 @@ void disk_unblock_events(struct gendisk *disk) } /** - * disk_check_events - schedule immediate event checking - * @disk: disk to check events for + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush * - * Schedule immediate event checking on @disk if not blocked. + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. * * CONTEXT: - * Don't care. Safe to call from irq context. + * If @mask is non-zero must be called with bdev->bd_mutex held. */ -void disk_check_events(struct gendisk *disk) +void disk_flush_events(struct gendisk *disk, unsigned int mask) { struct disk_events *ev = disk->ev; - unsigned long flags; if (!ev) return; - spin_lock_irqsave(&ev->lock, flags); + spin_lock_irq(&ev->lock); + ev->clearing |= mask; if (!ev->block) { cancel_delayed_work(&ev->dwork); queue_delayed_work(system_nrt_wq, &ev->dwork, 0); } - spin_unlock_irqrestore(&ev->lock, flags); + spin_unlock_irq(&ev->lock); } -EXPORT_SYMBOL_GPL(disk_check_events); /** * disk_clear_events - synchronously check, clear and return pending events @@ -1705,7 +1707,7 @@ static int disk_events_set_dfl_poll_msecs(const char *val, mutex_lock(&disk_events_mutex); list_for_each_entry(ev, &disk_events, node) - disk_check_events(ev->disk); + disk_flush_events(ev->disk, 0); mutex_unlock(&disk_events_mutex); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9fb0b15..c62fb84 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1448,6 +1448,8 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + mutex_lock(&bdev->bd_mutex); + if (mode & FMODE_EXCL) { bool bdev_free; @@ -1456,7 +1458,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ - mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); @@ -1474,17 +1475,21 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ - if (bdev_free) { - if (bdev->bd_write_holder) { - disk_unblock_events(bdev->bd_disk); - disk_check_events(bdev->bd_disk); - bdev->bd_write_holder = false; - } + if (bdev_free && bdev->bd_write_holder) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; } - - mutex_unlock(&bdev->bd_mutex); } + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + + mutex_unlock(&bdev->bd_mutex); + return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 61abb63..8be086e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -68,6 +68,8 @@ #ifdef CONFIG_BLOCK #include <linux/loop.h> +#include <linux/cdrom.h> +#include <linux/fd.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> @@ -944,6 +946,9 @@ COMPATIBLE_IOCTL(FIOQSIZE) IGNORE_IOCTL(LOOP_CLR_FD) /* md calls this on random blockdevs */ IGNORE_IOCTL(RAID_VERSION) +/* qemu/qemu-img might call these two on plain files for probing */ +IGNORE_IOCTL(CDROM_DRIVE_STATUS) +IGNORE_IOCTL(FDGETPRM32) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index d545e97..e3c63d1 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -237,22 +237,22 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -ssize_t part_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%d\n", p->policy ? 1 : 0); } -ssize_t part_alignment_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); } -ssize_t part_discard_alignment_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%u\n", p->discard_alignment); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c5e82ec..a159ba5 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -678,23 +678,19 @@ struct buffer_chunk { static void write_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_logged_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static void write_ordered_chunk(struct buffer_chunk *chunk) { int i; - get_fs_excl(); for (i = 0; i < chunk->nr; i++) { submit_ordered_buffer(chunk->bh[i]); } chunk->nr = 0; - put_fs_excl(); } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, @@ -986,8 +982,6 @@ static int flush_commit_list(struct super_block *s, return 0; } - get_fs_excl(); - /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ @@ -1145,7 +1139,6 @@ static int flush_commit_list(struct super_block *s, if (retval) reiserfs_abort(s, retval, "Journal write error in %s", __func__); - put_fs_excl(); return retval; } @@ -1374,8 +1367,6 @@ static int flush_journal_list(struct super_block *s, return 0; } - get_fs_excl(); - /* if all the work is already done, get out of here */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -1597,7 +1588,6 @@ static int flush_journal_list(struct super_block *s, put_journal_list(s, jl); if (flushall) mutex_unlock(&journal->j_flush_mutex); - put_fs_excl(); return err; } @@ -3108,7 +3098,6 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_trans_id = journal->j_trans_id; unlock_journal(sb); INIT_LIST_HEAD(&th->t_list); - get_fs_excl(); return 0; out_fail: @@ -3964,7 +3953,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush = flags & FLUSH_ALL; wait_on_commit = flags & WAIT; - put_fs_excl(); current->journal_info = th->t_handle_save; reiserfs_check_lock_depth(sb, "journal end"); if (journal->j_len == 0) { @@ -4316,4 +4304,3 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) dump_stack(); #endif } - @@ -351,13 +351,11 @@ bool grab_super_passive(struct super_block *sb) */ void lock_super(struct super_block * sb) { - get_fs_excl(); mutex_lock(&sb->s_lock); } void unlock_super(struct super_block * sb) { - put_fs_excl(); mutex_unlock(&sb->s_lock); } @@ -385,7 +383,6 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(&sb->s_inodes); @@ -400,7 +397,6 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a23722..0e67c45 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -73,7 +73,7 @@ enum rq_cmd_type_bits { /* * try to put the fields that are referenced together in the same cacheline. - * if you modify this structure, be sure to check block/blk-core.c:rq_init() + * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() * as well! */ struct request { @@ -260,8 +260,7 @@ struct queue_limits { unsigned char discard_zeroes_data; }; -struct request_queue -{ +struct request_queue { /* * Together with queue_head for cacheline sharing */ @@ -304,14 +303,14 @@ struct request_queue void *queuedata; /* - * queue needs bounce pages for pages above this limit + * various queue flags, see QUEUE_* below */ - gfp_t bounce_gfp; + unsigned long queue_flags; /* - * various queue flags, see QUEUE_* below + * queue needs bounce pages for pages above this limit */ - unsigned long queue_flags; + gfp_t bounce_gfp; /* * protects queue structures from reentrancy. ->__queue_lock should @@ -334,8 +333,8 @@ struct request_queue unsigned int nr_congestion_off; unsigned int nr_batching; - void *dma_drain_buffer; unsigned int dma_drain_size; + void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; @@ -393,7 +392,7 @@ struct request_queue #define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */ +#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ #define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ @@ -403,6 +402,7 @@ struct request_queue #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ +#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -857,12 +857,21 @@ struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +/* + * Note: Code in between changing the blk_plug list/cb_list or element of such + * lists is preemptable, but such code can't do sleep (or be very careful), + * otherwise data is corrupted. For details, please check schedule() where + * blk_schedule_flush_plug() is called. + */ struct blk_plug { unsigned long magic; struct list_head list; struct list_head cb_list; unsigned int should_sort; + unsigned int count; }; +#define BLK_MAX_REQUEST_COUNT 16 + struct blk_plug_cb { struct list_head list; void (*callback)(struct blk_plug_cb *); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 21a8ebf..d800d51 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -146,7 +146,7 @@ extern struct request *elv_rb_latter_request(struct request_queue *, struct requ /* * rb support functions. */ -extern struct request *elv_rb_add(struct rb_root *, struct request *); +extern void elv_rb_add(struct rb_root *, struct request *); extern void elv_rb_del(struct rb_root *, struct request *); extern struct request *elv_rb_find(struct rb_root *, sector_t); diff --git a/include/linux/fd.h b/include/linux/fd.h index f5d194a..72202b1 100644 --- a/include/linux/fd.h +++ b/include/linux/fd.h @@ -377,4 +377,26 @@ struct floppy_raw_cmd { #define FDEJECT _IO(2, 0x5a) /* eject the disk */ + +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT +#include <linux/compat.h> + +struct compat_floppy_struct { + compat_uint_t size; + compat_uint_t sect; + compat_uint_t head; + compat_uint_t track; + compat_uint_t stretch; + unsigned char gap; + unsigned char rate; + unsigned char spec1; + unsigned char fmt_gap; + const compat_caddr_t name; +}; + +#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) +#endif +#endif + #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b224dc4..0c35d6e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1469,10 +1469,6 @@ enum { #define vfs_check_frozen(sb, level) \ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) -#define get_fs_excl() atomic_inc(¤t->fs_excl) -#define put_fs_excl() atomic_dec(¤t->fs_excl) -#define has_fs_excl() atomic_read(¤t->fs_excl) - /* * until VFS tracks user namespaces for inodes, just make all files * belong to init_user_ns diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 300d758..02fa469 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -420,7 +420,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); -extern void disk_check_events(struct gendisk *disk); +extern void disk_flush_events(struct gendisk *disk, unsigned int mask); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 580f70c..d14e058 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -176,7 +176,6 @@ extern struct cred init_cred; .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ - .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index b2eee89..5037a0a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -5,6 +5,14 @@ #include <linux/rcupdate.h> struct cfq_queue; +struct cfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + struct cfq_io_context { void *key; @@ -12,11 +20,7 @@ struct cfq_io_context { struct io_context *ioc; - unsigned long last_end_request; - - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; + struct cfq_ttime ttime; struct list_head queue_list; struct hlist_node cic_list; diff --git a/include/linux/sched.h b/include/linux/sched.h index ed766ad..20b03bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1512,7 +1512,6 @@ struct task_struct { short il_next; short pref_node_fork; #endif - atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; /* diff --git a/kernel/exit.c b/kernel/exit.c index 73bb192..12ea415 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -898,7 +898,6 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); - WARN_ON(atomic_read(&tsk->fs_excl)); WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) diff --git a/kernel/fork.c b/kernel/fork.c index aeae5b1..17bf7c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -290,7 +290,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); - atomic_set(&tsk->fs_excl, 0); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e..2ef0dc9 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -505,7 +505,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); - synchronize_rcu(); + synchronize_rcu_expedited(); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, |