diff options
70 files changed, 3664 insertions, 3521 deletions
diff --git a/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt b/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt new file mode 100644 index 0000000..099d936 --- /dev/null +++ b/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt @@ -0,0 +1,14 @@ +NVIDIA Tegra 20 GART + +Required properties: +- compatible: "nvidia,tegra20-gart" +- reg: Two pairs of cells specifying the physical address and size of + the memory controller registers and the GART aperture respectively. + +Example: + + gart { + compatible = "nvidia,tegra20-gart"; + reg = <0x7000f024 0x00000018 /* controller registers */ + 0x58000000 0x02000000>; /* GART aperture */ + }; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b40b413..c45513d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -335,6 +335,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. requirements as needed. This option does not override iommu=pt + amd_iommu_dump= [HW,X86-64] + Enable AMD IOMMU driver option to dump the ACPI table + for AMD IOMMU. With this option enabled, AMD IOMMU + driver will print ACPI tables for AMD IOMMU during + IOMMU initialization. 
+ amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: <a>,<b> diff --git a/MAINTAINERS b/MAINTAINERS index a246490..64e675d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2818,6 +2818,12 @@ F: Documentation/firmware_class/ F: drivers/base/firmware*.c F: include/linux/firmware.h +FLOPPY DRIVER +M: Jiri Kosina <jkosina@suse.cz> +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/floppy.git +S: Odd fixes +F: drivers/block/floppy.c + FPU EMULATOR M: Bill Metzenthen <billm@melbpc.org.au> W: http://floatingpoint.sourceforge.net/emulator/index.html diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 3199b76..421bef9 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,8 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - # If BLK_CGROUP is a module, CFQ has to be built as module. - depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -34,8 +32,6 @@ config IOSCHED_CFQ This is the default I/O scheduler. - Note: If BLK_CGROUP=m, then CFQ can be built only as module. 
- config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && BLK_CGROUP diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 126c341..02cf633 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -11,1570 +11,612 @@ * Nauman Rafique <nauman@google.com> */ #include <linux/ioprio.h> -#include <linux/seq_file.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/slab.h> -#include "blk-cgroup.h" #include <linux/genhd.h> +#include <linux/delay.h> +#include <linux/atomic.h> +#include "blk-cgroup.h" +#include "blk.h" #define MAX_KEY_LEN 100 -static DEFINE_SPINLOCK(blkio_list_lock); -static LIST_HEAD(blkio_list); +static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; -EXPORT_SYMBOL_GPL(blkio_root_cgroup); +struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkcg_root); -/* for encoding cft->private value on file */ -#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) -/* What policy owns the file, proportional or throttle */ -#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) -#define BLKIOFILE_ATTR(val) ((val) & 0xffff) +static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; -static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) +struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - list_add(&pn->node, &blkcg->policy_list); + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkcg, css); } +EXPORT_SYMBOL_GPL(cgroup_to_blkcg); -static inline bool cftype_blkg_same_policy(struct cftype *cft, - struct blkio_group *blkg) +static struct blkcg *task_blkcg(struct task_struct *tsk) { - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - - if (blkg->plid == plid) - return 1; - - return 0; + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkcg, css); } -/* 
Determines if policy node matches cgroup file being accessed */ -static inline bool pn_matches_cftype(struct cftype *cft, - struct blkio_policy_node *pn) +struct blkcg *bio_blkcg(struct bio *bio) { - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - return (plid == pn->plid && fileid == pn->fileid); + if (bio && bio->bi_css) + return container_of(bio->bi_css, struct blkcg, css); + return task_blkcg(current); } +EXPORT_SYMBOL_GPL(bio_blkcg); -/* Must be called with blkcg->lock held */ -static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) +static bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) { - list_del(&pn->node); + return pol && test_bit(pol->plid, q->blkcg_pols); } -/* Must be called with blkcg->lock held */ -static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, - enum blkio_policy_id plid, int fileid) +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. 
+ */ +static void blkg_free(struct blkcg_gq *blkg) { - struct blkio_policy_node *pn; - - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) - return pn; - } + int i; - return NULL; -} + if (!blkg) + return; -struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), - struct blkio_cgroup, css); -} -EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkg_policy_data *pd = blkg->pd[i]; -struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkio_cgroup, css); -} -EXPORT_SYMBOL_GPL(task_blkio_cgroup); + if (!pd) + continue; -static inline void -blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) -{ - struct blkio_policy_type *blkiop; + if (pol && pol->pd_exit_fn) + pol->pd_exit_fn(blkg); - list_for_each_entry(blkiop, &blkio_list, list) { - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) - continue; - if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg->key, - blkg, weight); + kfree(pd); } + + kfree(blkg); } -static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, - int fileid) +/** + * blkg_alloc - allocate a blkg + * @blkcg: block cgroup the new blkg is associated with + * @q: request_queue the new blkg is associated with + * + * Allocate a new blkg assocating @blkcg and @q. 
+ */ +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) { - struct blkio_policy_type *blkiop; - - list_for_each_entry(blkiop, &blkio_list, list) { - - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) - continue; - - if (fileid == BLKIO_THROTL_read_bps_device - && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, - blkg, bps); + struct blkcg_gq *blkg; + int i; - if (fileid == BLKIO_THROTL_write_bps_device - && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, - blkg, bps); - } -} + /* alloc and init base part */ + blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); + if (!blkg) + return NULL; -static inline void blkio_update_group_iops(struct blkio_group *blkg, - unsigned int iops, int fileid) -{ - struct blkio_policy_type *blkiop; + blkg->q = q; + INIT_LIST_HEAD(&blkg->q_node); + blkg->blkcg = blkcg; + blkg->refcnt = 1; - list_for_each_entry(blkiop, &blkio_list, list) { + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkg_policy_data *pd; - /* If this policy does not own the blkg, do not send updates */ - if (blkiop->plid != blkg->plid) + if (!blkcg_policy_enabled(q, pol)) continue; - if (fileid == BLKIO_THROTL_read_iops_device - && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, - blkg, iops); + /* alloc per-policy data and attach it to blkg */ + pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); + if (!pd) { + blkg_free(blkg); + return NULL; + } - if (fileid == BLKIO_THROTL_write_iops_device - && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, - blkg,iops); + blkg->pd[i] = pd; + pd->blkg = blkg; } -} -/* - * Add to the appropriate stat variable depending on the request type. 
- * This should be called with the blkg->stats_lock held. - */ -static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, - bool sync) -{ - if (direction) - stat[BLKIO_STAT_WRITE] += add; - else - stat[BLKIO_STAT_READ] += add; - if (sync) - stat[BLKIO_STAT_SYNC] += add; - else - stat[BLKIO_STAT_ASYNC] += add; -} + /* invoke per-policy init */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; -/* - * Decrements the appropriate stat variable if non-zero depending on the - * request type. Panics on value being zero. - * This should be called with the blkg->stats_lock held. - */ -static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) -{ - if (direction) { - BUG_ON(stat[BLKIO_STAT_WRITE] == 0); - stat[BLKIO_STAT_WRITE]--; - } else { - BUG_ON(stat[BLKIO_STAT_READ] == 0); - stat[BLKIO_STAT_READ]--; - } - if (sync) { - BUG_ON(stat[BLKIO_STAT_SYNC] == 0); - stat[BLKIO_STAT_SYNC]--; - } else { - BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); - stat[BLKIO_STAT_ASYNC]--; + if (blkcg_policy_enabled(blkg->q, pol)) + pol->pd_init_fn(blkg); } -} -#ifdef CONFIG_DEBUG_BLK_CGROUP -/* This should be called with the blkg->stats_lock held. */ -static void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) -{ - if (blkio_blkg_waiting(&blkg->stats)) - return; - if (blkg == curr_blkg) - return; - blkg->stats.start_group_wait_time = sched_clock(); - blkio_mark_blkg_waiting(&blkg->stats); + return blkg; } -/* This should be called with the blkg->stats_lock held. 
*/ -static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) { - unsigned long long now; + struct blkcg_gq *blkg; - if (!blkio_blkg_waiting(stats)) - return; + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; - now = sched_clock(); - if (time_after64(now, stats->start_group_wait_time)) - stats->group_wait_time += now - stats->start_group_wait_time; - blkio_clear_blkg_waiting(stats); + /* + * Hint didn't match. Look up from the radix tree. Note that we + * may not be holding queue_lock and thus are not sure whether + * @blkg from blkg_tree has already been removed or not, so we + * can't update hint to the lookup result. Leave it to the caller. + */ + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q == q) + return blkg; + + return NULL; } -/* This should be called with the blkg->stats_lock held. */ -static void blkio_end_empty_time(struct blkio_group_stats *stats) +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. 
+ */ +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { - unsigned long long now; - - if (!blkio_blkg_empty(stats)) - return; + WARN_ON_ONCE(!rcu_read_lock_held()); - now = sched_clock(); - if (time_after64(now, stats->start_empty_time)) - stats->empty_time += now - stats->start_empty_time; - blkio_clear_blkg_empty(stats); + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q); } +EXPORT_SYMBOL_GPL(blkg_lookup); -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { - unsigned long flags; + struct blkcg_gq *blkg; + int ret; - spin_lock_irqsave(&blkg->stats_lock, flags); - BUG_ON(blkio_blkg_idling(&blkg->stats)); - blkg->stats.start_idle_time = sched_clock(); - blkio_mark_blkg_idling(&blkg->stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ - unsigned long flags; - unsigned long long now; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - if (blkio_blkg_idling(stats)) { - now = sched_clock(); - if (time_after64(now, stats->start_idle_time)) - stats->idle_time += now - stats->start_idle_time; - blkio_clear_blkg_idling(stats); + /* lookup and update hint on success, see __blkg_lookup() for details */ + blkg = __blkg_lookup(blkcg, q); + if (blkg) { + rcu_assign_pointer(blkcg->blkg_hint, blkg); + return blkg; } - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) -{ - unsigned long flags; - struct blkio_group_stats *stats; - - spin_lock_irqsave(&blkg->stats_lock, flags); 
- stats = &blkg->stats; - stats->avg_queue_size_sum += - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; - stats->avg_queue_size_samples++; - blkio_update_group_wait_time(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + /* blkg holds a reference to blkcg */ + if (!css_tryget(&blkcg->css)) + return ERR_PTR(-EINVAL); -void blkiocg_set_start_empty_time(struct blkio_group *blkg) -{ - unsigned long flags; - struct blkio_group_stats *stats; + /* allocate */ + ret = -ENOMEM; + blkg = blkg_alloc(blkcg, q); + if (unlikely(!blkg)) + goto err_put; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; + /* insert */ + ret = radix_tree_preload(GFP_ATOMIC); + if (ret) + goto err_free; - if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || - stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; + spin_lock(&blkcg->lock); + ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + if (likely(!ret)) { + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &q->blkg_list); } + spin_unlock(&blkcg->lock); - /* - * group is already marked empty. This can happen if cfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. 
- */ - if(blkio_blkg_empty(stats)) { - spin_unlock_irqrestore(&blkg->stats_lock, flags); - return; - } + radix_tree_preload_end(); - stats->start_empty_time = sched_clock(); - blkio_mark_blkg_empty(stats); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + if (!ret) + return blkg; +err_free: + blkg_free(blkg); +err_put: + css_put(&blkcg->css); + return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); -void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { - blkg->stats.dequeue += dequeue; -} -EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); -#else -static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, - struct blkio_group *curr_blkg) {} -static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} -#endif - -void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, - bool sync) -{ - unsigned long flags; - - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, - sync); - blkio_end_empty_time(&blkg->stats); - blkio_set_start_group_wait_time(blkg, curr_blkg); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dead(q) ? 
-EINVAL : -EBUSY); + return __blkg_lookup_create(blkcg, q); } -EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); +EXPORT_SYMBOL_GPL(blkg_lookup_create); -void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) +static void blkg_destroy(struct blkcg_gq *blkg) { - unsigned long flags; + struct request_queue *q = blkg->q; + struct blkcg *blkcg = blkg->blkcg; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&blkcg->lock); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, - unsigned long unaccounted_time) -{ - unsigned long flags; - - spin_lock_irqsave(&blkg->stats_lock, flags); - blkg->stats.time += time; -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg->stats.unaccounted_time += unaccounted_time; -#endif - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + /* Something wrong if we are trying to remove same group twice */ + WARN_ON_ONCE(list_empty(&blkg->q_node)); + WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); -/* - * should be called under rcu read lock or queue lock to make sure blkg pointer - * is valid. - */ -void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) -{ - struct blkio_group_stats_cpu *stats_cpu; - unsigned long flags; + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); + list_del_init(&blkg->q_node); + hlist_del_init_rcu(&blkg->blkcg_node); /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. + * Both setting lookup hint to and clearing it from @blkg are done + * under queue_lock. If it's not pointing to @blkg now, it never + * will. 
Hint assignment itself can race safely. */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(blkg->stats_cpu); - - u64_stats_update_begin(&stats_cpu->syncp); - stats_cpu->sectors += bytes >> 9; - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], - 1, direction, sync); - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], - bytes, direction, sync); - u64_stats_update_end(&stats_cpu->syncp); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); - -void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) -{ - struct blkio_group_stats *stats; - unsigned long flags; - unsigned long long now = sched_clock(); - - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - if (time_after64(now, io_start_time)) - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], - now - io_start_time, direction, sync); - if (time_after64(io_start_time, start_time)) - blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], - io_start_time - start_time, direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); -} -EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); - -/* Merged stats are per cpu. */ -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync) -{ - struct blkio_group_stats_cpu *stats_cpu; - unsigned long flags; + if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) + rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. 
*/ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(blkg->stats_cpu); - - u64_stats_update_begin(&stats_cpu->syncp); - blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, - direction, sync); - u64_stats_update_end(&stats_cpu->syncp); - local_irq_restore(flags); + blkg_put(blkg); } -EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); -/* - * This function allocates the per cpu stats for blkio_group. Should be called - * from sleepable context as alloc_per_cpu() requires that. +/** + * blkg_destroy_all - destroy all blkgs associated with a request_queue + * @q: request_queue of interest + * + * Destroy all blkgs associated with @q. */ -int blkio_alloc_blkg_stats(struct blkio_group *blkg) +static void blkg_destroy_all(struct request_queue *q) { - /* Allocate memory for per cpu stats */ - blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); - if (!blkg->stats_cpu) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + struct blkcg_gq *blkg, *n; -void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) -{ - unsigned long flags; - - spin_lock_irqsave(&blkcg->lock, flags); - spin_lock_init(&blkg->stats_lock); - rcu_assign_pointer(blkg->key, key); - blkg->blkcg_id = css_id(&blkcg->css); - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - blkg->plid = plid; - spin_unlock_irqrestore(&blkcg->lock, flags); - /* Need to take css reference ? */ - cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); - blkg->dev = dev; -} -EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); + lockdep_assert_held(q->queue_lock); -static void __blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - hlist_del_init_rcu(&blkg->blkcg_node); - blkg->blkcg_id = 0; -} + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; -/* - * returns 0 if blkio_group was still on cgroup list. 
Otherwise returns 1 - * indicating that blk_group was unhashed by the time we got to it. - */ -int blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - struct blkio_cgroup *blkcg; - unsigned long flags; - struct cgroup_subsys_state *css; - int ret = 1; - - rcu_read_lock(); - css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (css) { - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; - } - spin_unlock_irqrestore(&blkcg->lock, flags); + spin_lock(&blkcg->lock); + blkg_destroy(blkg); + spin_unlock(&blkcg->lock); } - - rcu_read_unlock(); - return ret; } -EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); -/* called under rcu_read_lock(). */ -struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +static void blkg_rcu_free(struct rcu_head *rcu_head) { - struct blkio_group *blkg; - struct hlist_node *n; - void *__key; - - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - __key = blkg->key; - if (__key == key) - return blkg; - } - - return NULL; + blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); } -EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -static void blkio_reset_stats_cpu(struct blkio_group *blkg) +void __blkg_release(struct blkcg_gq *blkg) { - struct blkio_group_stats_cpu *stats_cpu; - int i, j, k; + /* release the extra blkcg reference this blkg has been holding */ + css_put(&blkg->blkcg->css); + /* - * Note: On 64 bit arch this should not be an issue. This has the - * possibility of returning some inconsistent value on 32bit arch - * as 64bit update on 32bit is non atomic. Taking care of this - * corner case makes code very complicated, like sending IPIs to - * cpus, taking care of stats of offline cpus etc. + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. 
For example, don't try to follow throtl_data and + * request queue links. * - * reset stats is anyway more of a debug feature and this sounds a - * corner case. So I am not complicating the code yet until and - * unless this becomes a real issue. + * Having a reference to blkg under an rcu allows acess to only + * values local to groups like group stats and group rate limits */ - for_each_possible_cpu(i) { - stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); - stats_cpu->sectors = 0; - for(j = 0; j < BLKIO_STAT_CPU_NR; j++) - for (k = 0; k < BLKIO_STAT_TOTAL; k++) - stats_cpu->stat_arr_cpu[j][k] = 0; - } + call_rcu(&blkg->rcu_head, blkg_rcu_free); } +EXPORT_SYMBOL_GPL(__blkg_release); -static int -blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, + u64 val) { - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct blkio_group_stats *stats; + struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg_gq *blkg; struct hlist_node *n; - uint64_t queued[BLKIO_STAT_TOTAL]; int i; -#ifdef CONFIG_DEBUG_BLK_CGROUP - bool idling, waiting, empty; - unsigned long long now = sched_clock(); -#endif - blkcg = cgroup_to_blkio_cgroup(cgroup); + mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkg->stats_lock); - stats = &blkg->stats; -#ifdef CONFIG_DEBUG_BLK_CGROUP - idling = blkio_blkg_idling(stats); - waiting = blkio_blkg_waiting(stats); - empty = blkio_blkg_empty(stats); -#endif - for (i = 0; i < BLKIO_STAT_TOTAL; i++) - queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; - memset(stats, 0, sizeof(struct blkio_group_stats)); - for (i = 0; i < BLKIO_STAT_TOTAL; i++) - stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (idling) { - blkio_mark_blkg_idling(stats); - stats->start_idle_time = now; - } - if (waiting) { - blkio_mark_blkg_waiting(stats); - 
stats->start_group_wait_time = now; - } - if (empty) { - blkio_mark_blkg_empty(stats); - stats->start_empty_time = now; - } -#endif - spin_unlock(&blkg->stats_lock); - - /* Reset Per cpu stats which don't take blkg->stats_lock */ - blkio_reset_stats_cpu(blkg); - } - - spin_unlock_irq(&blkcg->lock); - return 0; -} - -static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, - int chars_left, bool diskname_only) -{ - snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); - chars_left -= strlen(str); - if (chars_left <= 0) { - printk(KERN_WARNING - "Possibly incorrect cgroup stat display format"); - return; - } - if (diskname_only) - return; - switch (type) { - case BLKIO_STAT_READ: - strlcat(str, " Read", chars_left); - break; - case BLKIO_STAT_WRITE: - strlcat(str, " Write", chars_left); - break; - case BLKIO_STAT_SYNC: - strlcat(str, " Sync", chars_left); - break; - case BLKIO_STAT_ASYNC: - strlcat(str, " Async", chars_left); - break; - case BLKIO_STAT_TOTAL: - strlcat(str, " Total", chars_left); - break; - default: - strlcat(str, " Invalid", chars_left); - } -} - -static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, - struct cgroup_map_cb *cb, dev_t dev) -{ - blkio_get_key_name(0, dev, str, chars_left, true); - cb->fill(cb, str, val); - return val; -} - - -static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, - enum stat_type_cpu type, enum stat_sub_type sub_type) -{ - int cpu; - struct blkio_group_stats_cpu *stats_cpu; - u64 val = 0, tval; - - for_each_possible_cpu(cpu) { - unsigned int start; - stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); - - do { - start = u64_stats_fetch_begin(&stats_cpu->syncp); - if (type == BLKIO_STAT_CPU_SECTORS) - tval = stats_cpu->sectors; - else - tval = stats_cpu->stat_arr_cpu[type][sub_type]; - } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); - - val += tval; - } - - return val; -} - -static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, - struct cgroup_map_cb *cb, 
dev_t dev, enum stat_type_cpu type) -{ - uint64_t disk_total, val; - char key_str[MAX_KEY_LEN]; - enum stat_sub_type sub_type; - if (type == BLKIO_STAT_CPU_SECTORS) { - val = blkio_read_stat_cpu(blkg, type, 0); - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); - } - - for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; - sub_type++) { - blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); - val = blkio_read_stat_cpu(blkg, type, sub_type); - cb->fill(cb, key_str, val); - } - - disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + - blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); - - blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); - cb->fill(cb, key_str, disk_total); - return disk_total; -} - -/* This should be called with blkg->stats_lock held */ -static uint64_t blkio_get_stat(struct blkio_group *blkg, - struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) -{ - uint64_t disk_total; - char key_str[MAX_KEY_LEN]; - enum stat_sub_type sub_type; - - if (type == BLKIO_STAT_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.time, cb, dev); -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (type == BLKIO_STAT_UNACCOUNTED_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.unaccounted_time, cb, dev); - if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { - uint64_t sum = blkg->stats.avg_queue_size_sum; - uint64_t samples = blkg->stats.avg_queue_size_samples; - if (samples) - do_div(sum, samples); - else - sum = 0; - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); - } - if (type == BLKIO_STAT_GROUP_WAIT_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.group_wait_time, cb, dev); - if (type == BLKIO_STAT_IDLE_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.idle_time, cb, dev); - if (type == BLKIO_STAT_EMPTY_TIME) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.empty_time, cb, dev); - if (type == 
BLKIO_STAT_DEQUEUE) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.dequeue, cb, dev); -#endif - - for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; - sub_type++) { - blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); - cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); - } - disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + - blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; - blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); - cb->fill(cb, key_str, disk_total); - return disk_total; -} - -static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) -{ - struct gendisk *disk = NULL; - char *s[4], *p, *major_s = NULL, *minor_s = NULL; - unsigned long major, minor; - int i = 0, ret = -EINVAL; - int part; - dev_t dev; - u64 temp; - - memset(s, 0, sizeof(s)); - - while ((p = strsep(&buf, " ")) != NULL) { - if (!*p) - continue; - - s[i++] = p; - - /* Prevent from inputing too many things */ - if (i == 3) - break; - } - - if (i != 2) - goto out; - - p = strsep(&s[0], ":"); - if (p != NULL) - major_s = p; - else - goto out; - - minor_s = s[0]; - if (!minor_s) - goto out; - - if (strict_strtoul(major_s, 10, &major)) - goto out; - - if (strict_strtoul(minor_s, 10, &minor)) - goto out; - - dev = MKDEV(major, minor); - - if (strict_strtoull(s[1], 10, &temp)) - goto out; - - /* For rule removal, do not check for device presence. 
*/ - if (temp) { - disk = get_gendisk(dev, &part); - if (!disk || part) { - ret = -ENODEV; - goto out; - } - } - - newpn->dev = dev; - - switch (plid) { - case BLKIO_POLICY_PROP: - if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - goto out; - - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.weight = temp; - break; - case BLKIO_POLICY_THROTL: - switch(fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.bps = temp; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - if (temp > THROTL_IOPS_MAX) - goto out; - - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.iops = (unsigned int)temp; - break; - } - break; - default: - BUG(); - } - ret = 0; -out: - put_disk(disk); - return ret; -} - -unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, - dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - unsigned int weight; - - spin_lock_irqsave(&blkcg->lock, flags); - - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, - BLKIO_PROP_weight_device); - if (pn) - weight = pn->val.weight; - else - weight = blkcg->weight; - - spin_unlock_irqrestore(&blkcg->lock, flags); - - return weight; -} -EXPORT_SYMBOL_GPL(blkcg_get_weight); - -uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - uint64_t bps = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device); - if (pn) - bps = pn->val.bps; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return bps; -} - -uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - uint64_t bps = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - 
BLKIO_THROTL_write_bps_device); - if (pn) - bps = pn->val.bps; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return bps; -} - -unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - unsigned int iops = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device); - if (pn) - iops = pn->val.iops; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return iops; -} - -unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) -{ - struct blkio_policy_node *pn; - unsigned long flags; - unsigned int iops = -1; - - spin_lock_irqsave(&blkcg->lock, flags); - pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device); - if (pn) - iops = pn->val.iops; - spin_unlock_irqrestore(&blkcg->lock, flags); - - return iops; -} + /* + * Note that stat reset is racy - it doesn't synchronize against + * stat updates. This is a debug feature which shouldn't exist + * anyway. If you get hit by a race, retry. 
+ */ + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; -/* Checks whether user asked for deleting a policy rule */ -static bool blkio_delete_rule_command(struct blkio_policy_node *pn) -{ - switch(pn->plid) { - case BLKIO_POLICY_PROP: - if (pn->val.weight == 0) - return 1; - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - if (pn->val.bps == 0) - return 1; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - if (pn->val.iops == 0) - return 1; + if (blkcg_policy_enabled(blkg->q, pol) && + pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg); } - break; - default: - BUG(); } + spin_unlock_irq(&blkcg->lock); + mutex_unlock(&blkcg_pol_mutex); return 0; } -static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, - struct blkio_policy_node *newpn) -{ - switch(oldpn->plid) { - case BLKIO_POLICY_PROP: - oldpn->val.weight = newpn->val.weight; - break; - case BLKIO_POLICY_THROTL: - switch(newpn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - oldpn->val.bps = newpn->val.bps; - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - oldpn->val.iops = newpn->val.iops; - } - break; - default: - BUG(); - } -} - -/* - * Some rules/values in blkg have changed. Propagate those to respective - * policies. - */ -static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, struct blkio_policy_node *pn) +static const char *blkg_dev_name(struct blkcg_gq *blkg) { - unsigned int weight, iops; - u64 bps; - - switch(pn->plid) { - case BLKIO_POLICY_PROP: - weight = pn->val.weight ? 
pn->val.weight : - blkcg->weight; - blkio_update_group_weight(blkg, weight); - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - bps = pn->val.bps ? pn->val.bps : (-1); - blkio_update_group_bps(blkg, bps, pn->fileid); - break; - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - iops = pn->val.iops ? pn->val.iops : (-1); - blkio_update_group_iops(blkg, iops, pn->fileid); - break; - } - break; - default: - BUG(); - } + /* some drivers (floppy) instantiate a queue w/o disk registered */ + if (blkg->q->backing_dev_info.dev) + return dev_name(blkg->q->backing_dev_info.dev); + return NULL; } -/* - * A policy node rule has been updated. Propagate this update to all the - * block groups which might be affected by this update. +/** + * blkcg_print_blkgs - helper for printing per-blkg data + * @sf: seq_file to print to + * @blkcg: blkcg of interest + * @prfill: fill function to print out a blkg + * @pol: policy in question + * @data: data to be passed to @prfill + * @show_total: to print out sum of prfill return values or not + * + * This function invokes @prfill on each blkg of @blkcg if pd for the + * policy specified by @pol exists. @prfill is invoked with @sf, the + * policy data and @data. If @show_total is %true, the sum of the return + * values from @prfill is printed with "Total" label at the end. + * + * This is to be used to construct print functions for + * cftype->read_seq_string method. 
*/ -static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, - struct blkio_policy_node *pn) +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total) { - struct blkio_group *blkg; + struct blkcg_gq *blkg; struct hlist_node *n; + u64 total = 0; - spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (pn->dev != blkg->dev || pn->plid != blkg->plid) - continue; - blkio_update_blkg_policy(blkcg, blkg, pn); - } - + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + if (blkcg_policy_enabled(blkg->q, pol)) + total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + + if (show_total) + seq_printf(sf, "Total %llu\n", (unsigned long long)total); } +EXPORT_SYMBOL_GPL(blkcg_print_blkgs); -static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +/** + * __blkg_prfill_u64 - prfill helper for a single u64 value + * @sf: seq_file to print to + * @pd: policy private data of interest + * @v: value to print + * + * Print @v to @sf for the device assocaited with @pd. 
+ */ +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { - int ret = 0; - char *buf; - struct blkio_policy_node *newpn, *pn; - struct blkio_cgroup *blkcg; - int keep_newpn = 0; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int fileid = BLKIOFILE_ATTR(cft->private); - - buf = kstrdup(buffer, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); - if (!newpn) { - ret = -ENOMEM; - goto free_buf; - } - - ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); - if (ret) - goto free_newpn; - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - spin_lock_irq(&blkcg->lock); - - pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); - if (!pn) { - if (!blkio_delete_rule_command(newpn)) { - blkio_policy_insert_node(blkcg, newpn); - keep_newpn = 1; - } - spin_unlock_irq(&blkcg->lock); - goto update_io_group; - } - - if (blkio_delete_rule_command(newpn)) { - blkio_policy_delete_node(pn); - kfree(pn); - spin_unlock_irq(&blkcg->lock); - goto update_io_group; - } - spin_unlock_irq(&blkcg->lock); + const char *dname = blkg_dev_name(pd->blkg); - blkio_update_policy_rule(pn, newpn); + if (!dname) + return 0; -update_io_group: - blkio_update_policy_node_blkg(blkcg, newpn); - -free_newpn: - if (!keep_newpn) - kfree(newpn); -free_buf: - kfree(buf); - return ret; + seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); + return v; } +EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -static void -blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) -{ - switch(pn->plid) { - case BLKIO_POLICY_PROP: - if (pn->fileid == BLKIO_PROP_weight_device) - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.weight); - break; - case BLKIO_POLICY_THROTL: - switch(pn->fileid) { - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.bps); - break; - case BLKIO_THROTL_read_iops_device: - 
case BLKIO_THROTL_write_iops_device: - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.iops); - break; - } - break; - default: - BUG(); - } -} +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat *rwstat) +{ + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; -/* cgroup files which read their data from policy nodes end up here */ -static void blkio_read_policy_node_files(struct cftype *cft, - struct blkio_cgroup *blkcg, struct seq_file *m) -{ - struct blkio_policy_node *pn; - - if (!list_empty(&blkcg->policy_list)) { - spin_lock_irq(&blkcg->lock); - list_for_each_entry(pn, &blkcg->policy_list, node) { - if (!pn_matches_cftype(cft, pn)) - continue; - blkio_print_policy_node(m, pn); - } - spin_unlock_irq(&blkcg->lock); - } -} + if (!dname) + return 0; -static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) -{ - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_weight_device: - blkio_read_policy_node_files(cft, blkcg, m); - return 0; - default: - BUG(); - } - break; - case BLKIO_POLICY_THROTL: - switch(name){ - case BLKIO_THROTL_read_bps_device: - case BLKIO_THROTL_write_bps_device: - case BLKIO_THROTL_read_iops_device: - case BLKIO_THROTL_write_iops_device: - blkio_read_policy_node_files(cft, blkcg, m); - return 0; - default: - BUG(); - } - break; - 
default: - BUG(); - } + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + (unsigned long long)rwstat->cnt[i]); - return 0; + v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); + return v; } -static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, - struct cftype *cft, struct cgroup_map_cb *cb, - enum stat_type type, bool show_total, bool pcpu) +/** + * blkg_prfill_stat - prfill callback for blkg_stat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_stat in @pd + * + * prfill callback for printing a blkg_stat. + */ +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkio_group *blkg; - struct hlist_node *n; - uint64_t cgroup_total = 0; - - rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (blkg->dev) { - if (!cftype_blkg_same_policy(cft, blkg)) - continue; - if (pcpu) - cgroup_total += blkio_get_stat_cpu(blkg, cb, - blkg->dev, type); - else { - spin_lock_irq(&blkg->stats_lock); - cgroup_total += blkio_get_stat(blkg, cb, - blkg->dev, type); - spin_unlock_irq(&blkg->stats_lock); - } - } - } - if (show_total) - cb->fill(cb, "Total", cgroup_total); - rcu_read_unlock(); - return 0; + return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); } +EXPORT_SYMBOL_GPL(blkg_prfill_stat); -/* All map kind of cgroup file get serviced by this function */ -static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. 
+ */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) { - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_TIME, 0, 0); - case BLKIO_PROP_sectors: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_SECTORS, 0, 1); - case BLKIO_PROP_io_service_bytes: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); - case BLKIO_PROP_io_serviced: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_SERVICED, 1, 1); - case BLKIO_PROP_io_service_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_TIME, 1, 0); - case BLKIO_PROP_io_wait_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_WAIT_TIME, 1, 0); - case BLKIO_PROP_io_merged: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_MERGED, 1, 1); - case BLKIO_PROP_io_queued: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_QUEUED, 1, 0); -#ifdef CONFIG_DEBUG_BLK_CGROUP - case BLKIO_PROP_unaccounted_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); - case BLKIO_PROP_dequeue: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_DEQUEUE, 0, 0); - case BLKIO_PROP_avg_queue_size: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); - case BLKIO_PROP_group_wait_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); - case BLKIO_PROP_idle_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_IDLE_TIME, 0, 0); - case BLKIO_PROP_empty_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_EMPTY_TIME, 0, 0); -#endif - default: - BUG(); - } - break; - case BLKIO_POLICY_THROTL: - switch(name){ 
- case BLKIO_THROTL_io_service_bytes: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); - case BLKIO_THROTL_io_serviced: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_CPU_SERVICED, 1, 1); - default: - BUG(); - } - break; - default: - BUG(); - } + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); - return 0; + return __blkg_prfill_rwstat(sf, pd, &rwstat); } +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); -static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +/** + * blkg_conf_prep - parse and prepare for per-blkg config update + * @blkcg: target block cgroup + * @pol: target policy + * @input: input string + * @ctx: blkg_conf_ctx to be filled + * + * Parse per-blkg config update from @input and initialize @ctx with the + * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new + * value. This function returns with RCU read lock and queue lock held and + * must be paired with blkg_conf_finish(). + */ +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx) + __acquires(rcu) __acquires(disk->queue->queue_lock) { - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_node *pn; + struct gendisk *disk; + struct blkcg_gq *blkg; + unsigned int major, minor; + unsigned long long v; + int part, ret; - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) return -EINVAL; - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev, - BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); - if (pn) - continue; - - blkio_update_group_weight(blkg, blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} + disk = get_gendisk(MKDEV(major, minor), &part); + if (!disk || 
part) + return -EINVAL; -static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); + rcu_read_lock(); + spin_lock_irq(disk->queue->queue_lock); - blkcg = cgroup_to_blkio_cgroup(cgrp); + if (blkcg_policy_enabled(disk->queue, pol)) + blkg = blkg_lookup_create(blkcg, disk->queue); + else + blkg = ERR_PTR(-EINVAL); - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_weight: - return (u64)blkcg->weight; + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + rcu_read_unlock(); + spin_unlock_irq(disk->queue->queue_lock); + put_disk(disk); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue + * can be bypassing for some time and it's always nice to + * avoid busy looping. + */ + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); } - break; - default: - BUG(); + return ret; } + + ctx->disk = disk; + ctx->blkg = blkg; + ctx->v = v; return 0; } +EXPORT_SYMBOL_GPL(blkg_conf_prep); -static int -blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +/** + * blkg_conf_finish - finish up per-blkg config update + * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * + * Finish up after per-blkg config update. This function must be paired + * with blkg_conf_prep(). 
+ */ +void blkg_conf_finish(struct blkg_conf_ctx *ctx) + __releases(ctx->disk->queue->queue_lock) __releases(rcu) { - struct blkio_cgroup *blkcg; - enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); - int name = BLKIOFILE_ATTR(cft->private); - - blkcg = cgroup_to_blkio_cgroup(cgrp); - - switch(plid) { - case BLKIO_POLICY_PROP: - switch(name) { - case BLKIO_PROP_weight: - return blkio_weight_write(blkcg, val); - } - break; - default: - BUG(); - } - - return 0; + spin_unlock_irq(ctx->disk->queue->queue_lock); + rcu_read_unlock(); + put_disk(ctx->disk); } +EXPORT_SYMBOL_GPL(blkg_conf_finish); -struct cftype blkio_files[] = { - { - .name = "weight_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_weight_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - { - .name = "weight", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_weight), - .read_u64 = blkiocg_file_read_u64, - .write_u64 = blkiocg_file_write_u64, - }, - { - .name = "time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "sectors", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_sectors), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_serviced), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_service_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_service_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_wait_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_wait_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_merged", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - 
BLKIO_PROP_io_merged), - .read_map = blkiocg_file_read_map, - }, - { - .name = "io_queued", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_io_queued), - .read_map = blkiocg_file_read_map, - }, +struct cftype blkcg_files[] = { { .name = "reset_stats", - .write_u64 = blkiocg_reset_stats, - }, -#ifdef CONFIG_BLK_DEV_THROTTLING - { - .name = "throttle.read_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.write_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.read_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.write_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - { - .name = "throttle.io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, - { - .name = "throttle.io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_serviced), - .read_map = blkiocg_file_read_map, - }, -#endif /* CONFIG_BLK_DEV_THROTTLING */ - -#ifdef CONFIG_DEBUG_BLK_CGROUP - { - .name = "avg_queue_size", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_avg_queue_size), - .read_map = blkiocg_file_read_map, + .write_u64 = blkcg_reset_stats, }, - { - .name = "group_wait_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_group_wait_time), - 
.read_map = blkiocg_file_read_map, - }, - { - .name = "idle_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_idle_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "empty_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_empty_time), - .read_map = blkiocg_file_read_map, - }, - { - .name = "dequeue", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_dequeue), - .read_map = blkiocg_file_read_map, - }, - { - .name = "unaccounted_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_unaccounted_time), - .read_map = blkiocg_file_read_map, - }, -#endif { } /* terminate */ }; -static void blkiocg_destroy(struct cgroup *cgroup) +/** + * blkcg_pre_destroy - cgroup pre_destroy callback + * @cgroup: cgroup of interest + * + * This function is called when @cgroup is about to go away and responsible + * for shooting down all blkgs associated with @cgroup. blkgs should be + * removed while holding both q and blkcg locks. As blkcg lock is nested + * inside q lock, this function performs reverse double lock dancing. + * + * This is the blkcg counterpart of ioc_release_fn(). 
+ */ +static int blkcg_pre_destroy(struct cgroup *cgroup) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - unsigned long flags; - struct blkio_group *blkg; - void *key; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn, *pntmp; + struct blkcg *blkcg = cgroup_to_blkcg(cgroup); - rcu_read_lock(); - do { - spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_irq(&blkcg->lock); - if (hlist_empty(&blkcg->blkg_list)) { - spin_unlock_irqrestore(&blkcg->lock, flags); - break; + while (!hlist_empty(&blkcg->blkg_list)) { + struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, + struct blkcg_gq, blkcg_node); + struct request_queue *q = blkg->q; + + if (spin_trylock(q->queue_lock)) { + blkg_destroy(blkg); + spin_unlock(q->queue_lock); + } else { + spin_unlock_irq(&blkcg->lock); + cpu_relax(); + spin_lock_irq(&blkcg->lock); } + } - blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, - blkcg_node); - key = rcu_dereference(blkg->key); - __blkiocg_del_blkio_group(blkg); - - spin_unlock_irqrestore(&blkcg->lock, flags); - - /* - * This blkio_group is being unlinked as associated cgroup is - * going away. Let all the IO controlling policies know about - * this event. 
- */ - spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) { - if (blkiop->plid != blkg->plid) - continue; - blkiop->ops.blkio_unlink_group_fn(key, blkg); - } - spin_unlock(&blkio_list_lock); - } while (1); + spin_unlock_irq(&blkcg->lock); + return 0; +} - list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { - blkio_policy_delete_node(pn); - kfree(pn); - } +static void blkcg_destroy(struct cgroup *cgroup) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgroup); - free_css_id(&blkio_subsys, &blkcg->css); - rcu_read_unlock(); - if (blkcg != &blkio_root_cgroup) + if (blkcg != &blkcg_root) kfree(blkcg); } -static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) +static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup) { - struct blkio_cgroup *blkcg; + static atomic64_t id_seq = ATOMIC64_INIT(0); + struct blkcg *blkcg; struct cgroup *parent = cgroup->parent; if (!parent) { - blkcg = &blkio_root_cgroup; + blkcg = &blkcg_root; goto done; } @@ -1582,22 +624,68 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) if (!blkcg) return ERR_PTR(-ENOMEM); - blkcg->weight = BLKIO_WEIGHT_DEFAULT; + blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; + blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ done: spin_lock_init(&blkcg->lock); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); - INIT_LIST_HEAD(&blkcg->policy_list); return &blkcg->css; } +/** + * blkcg_init_queue - initialize blkcg part of request queue + * @q: request_queue to initialize + * + * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * part of new request_queue @q. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkcg_init_queue(struct request_queue *q) +{ + might_sleep(); + + return blk_throtl_init(q); +} + +/** + * blkcg_drain_queue - drain blkcg part of request_queue + * @q: request_queue to drain + * + * Called from blk_drain_queue(). 
Responsible for draining blkcg part. + */ +void blkcg_drain_queue(struct request_queue *q) +{ + lockdep_assert_held(q->queue_lock); + + blk_throtl_drain(q); +} + +/** + * blkcg_exit_queue - exit and release blkcg part of request_queue + * @q: request_queue being released + * + * Called from blk_release_queue(). Responsible for exiting blkcg part. + */ +void blkcg_exit_queue(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + + blk_throtl_exit(q); +} + /* * We cannot support shared io contexts, as we have no mean to support * two tasks with the same ioc in two different groups without major rework * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; @@ -1616,63 +704,213 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) return ret; } -static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct io_context *ioc; - - cgroup_taskset_for_each(task, cgrp, tset) { - /* we don't lose anything even if ioc allocation fails */ - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc_cgroup_changed(ioc); - put_io_context(ioc); - } - } -} - struct cgroup_subsys blkio_subsys = { .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, -#ifdef CONFIG_BLK_CGROUP - /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ + .create = blkcg_create, + .can_attach = blkcg_can_attach, + .pre_destroy = blkcg_pre_destroy, + .destroy = blkcg_destroy, .subsys_id = blkio_subsys_id, -#endif - .base_cftypes = blkio_files, - .use_id = 1, + .base_cftypes 
= blkcg_files, .module = THIS_MODULE, }; EXPORT_SYMBOL_GPL(blkio_subsys); -void blkio_policy_register(struct blkio_policy_type *blkiop) +/** + * blkcg_activate_policy - activate a blkcg policy on a request_queue + * @q: request_queue of interest + * @pol: blkcg policy to activate + * + * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * bypass mode to populate its blkgs with policy_data for @pol. + * + * Activation happens with @q bypassed, so nobody would be accessing blkgs + * from IO path. Update of each blkg is protected by both queue and blkcg + * locks so that holding either lock and testing blkcg_policy_enabled() is + * always enough for dereferencing policy data. + * + * The caller is responsible for synchronizing [de]activations and policy + * [un]registerations. Returns 0 on success, -errno on failure. + */ +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { - spin_lock(&blkio_list_lock); - list_add_tail(&blkiop->list, &blkio_list); - spin_unlock(&blkio_list_lock); + LIST_HEAD(pds); + struct blkcg_gq *blkg; + struct blkg_policy_data *pd, *n; + int cnt = 0, ret; + + if (blkcg_policy_enabled(q, pol)) + return 0; + + blk_queue_bypass_start(q); + + /* make sure the root blkg exists and count the existing blkgs */ + spin_lock_irq(q->queue_lock); + + rcu_read_lock(); + blkg = __blkg_lookup_create(&blkcg_root, q); + rcu_read_unlock(); + + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto out_unlock; + } + q->root_blkg = blkg; + + list_for_each_entry(blkg, &q->blkg_list, q_node) + cnt++; + + spin_unlock_irq(q->queue_lock); + + /* allocate policy_data for all existing blkgs */ + while (cnt--) { + pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + if (!pd) { + ret = -ENOMEM; + goto out_free; + } + list_add_tail(&pd->alloc_node, &pds); + } + + /* + * Install the allocated pds. With @q bypassing, no new blkg + * should have been created while the queue lock was dropped. 
+ */ + spin_lock_irq(q->queue_lock); + + list_for_each_entry(blkg, &q->blkg_list, q_node) { + if (WARN_ON(list_empty(&pds))) { + /* umm... this shouldn't happen, just abort */ + ret = -ENOMEM; + goto out_unlock; + } + pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); + list_del_init(&pd->alloc_node); + + /* grab blkcg lock too while installing @pd on @blkg */ + spin_lock(&blkg->blkcg->lock); + + blkg->pd[pol->plid] = pd; + pd->blkg = blkg; + pol->pd_init_fn(blkg); + + spin_unlock(&blkg->blkcg->lock); + } + + __set_bit(pol->plid, q->blkcg_pols); + ret = 0; +out_unlock: + spin_unlock_irq(q->queue_lock); +out_free: + blk_queue_bypass_end(q); + list_for_each_entry_safe(pd, n, &pds, alloc_node) + kfree(pd); + return ret; } -EXPORT_SYMBOL_GPL(blkio_policy_register); +EXPORT_SYMBOL_GPL(blkcg_activate_policy); -void blkio_policy_unregister(struct blkio_policy_type *blkiop) +/** + * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue + * @q: request_queue of interest + * @pol: blkcg policy to deactivate + * + * Deactivate @pol on @q. Follows the same synchronization rules as + * blkcg_activate_policy(). 
+ */ +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { - spin_lock(&blkio_list_lock); - list_del_init(&blkiop->list); - spin_unlock(&blkio_list_lock); + struct blkcg_gq *blkg; + + if (!blkcg_policy_enabled(q, pol)) + return; + + blk_queue_bypass_start(q); + spin_lock_irq(q->queue_lock); + + __clear_bit(pol->plid, q->blkcg_pols); + + /* if no policy is left, no need for blkgs - shoot them down */ + if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) + blkg_destroy_all(q); + + list_for_each_entry(blkg, &q->blkg_list, q_node) { + /* grab blkcg lock too while removing @pd from @blkg */ + spin_lock(&blkg->blkcg->lock); + + if (pol->pd_exit_fn) + pol->pd_exit_fn(blkg); + + kfree(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + + spin_unlock(&blkg->blkcg->lock); + } + + spin_unlock_irq(q->queue_lock); + blk_queue_bypass_end(q); } -EXPORT_SYMBOL_GPL(blkio_policy_unregister); +EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); -static int __init init_cgroup_blkio(void) +/** + * blkcg_policy_register - register a blkcg policy + * @pol: blkcg policy to register + * + * Register @pol with blkcg core. Might sleep and @pol may be modified on + * successful registration. Returns 0 on success and -errno on failure. 
+ */ +int blkcg_policy_register(struct blkcg_policy *pol) { - return cgroup_load_subsys(&blkio_subsys); + int i, ret; + + if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) + return -EINVAL; + + mutex_lock(&blkcg_pol_mutex); + + /* find an empty slot */ + ret = -ENOSPC; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (!blkcg_policy[i]) + break; + if (i >= BLKCG_MAX_POLS) + goto out_unlock; + + /* register and update blkgs */ + pol->plid = i; + blkcg_policy[i] = pol; + + /* everything is in place, add intf files for the new policy */ + if (pol->cftypes) + WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); + ret = 0; +out_unlock: + mutex_unlock(&blkcg_pol_mutex); + return ret; } +EXPORT_SYMBOL_GPL(blkcg_policy_register); -static void __exit exit_cgroup_blkio(void) +/** + * blkcg_policy_unregister - unregister a blkcg policy + * @pol: blkcg policy to unregister + * + * Undo blkcg_policy_register(@pol). Might sleep. + */ +void blkcg_policy_unregister(struct blkcg_policy *pol) { - cgroup_unload_subsys(&blkio_subsys); -} + mutex_lock(&blkcg_pol_mutex); -module_init(init_cgroup_blkio); -module_exit(exit_cgroup_blkio); -MODULE_LICENSE("GPL"); + if (WARN_ON(blkcg_policy[pol->plid] != pol)) + goto out_unlock; + + /* kill the intf files first */ + if (pol->cftypes) + cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + + /* unregister and update blkgs */ + blkcg_policy[pol->plid] = NULL; +out_unlock: + mutex_unlock(&blkcg_pol_mutex); +} +EXPORT_SYMBOL_GPL(blkcg_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 6f3ace7..8ac457c 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,350 +15,371 @@ #include <linux/cgroup.h> #include <linux/u64_stats_sync.h> - -enum blkio_policy_id { - BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ - BLKIO_POLICY_THROTL, /* Throttling */ -}; +#include <linux/seq_file.h> +#include <linux/radix-tree.h> /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX -#if 
defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) - -#ifndef CONFIG_BLK_CGROUP -/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ -extern struct cgroup_subsys blkio_subsys; -#define blkio_subsys_id blkio_subsys.subsys_id -#endif - -enum stat_type { - /* Total time spent (in ns) between request dispatch to the driver and - * request completion for IOs doen by this cgroup. This may not be - * accurate when NCQ is turned on. */ - BLKIO_STAT_SERVICE_TIME = 0, - /* Total time spent waiting in scheduler queue in ns */ - BLKIO_STAT_WAIT_TIME, - /* Number of IOs queued up */ - BLKIO_STAT_QUEUED, - /* All the single valued stats go below this */ - BLKIO_STAT_TIME, -#ifdef CONFIG_DEBUG_BLK_CGROUP - /* Time not charged to this cgroup */ - BLKIO_STAT_UNACCOUNTED_TIME, - BLKIO_STAT_AVG_QUEUE_SIZE, - BLKIO_STAT_IDLE_TIME, - BLKIO_STAT_EMPTY_TIME, - BLKIO_STAT_GROUP_WAIT_TIME, - BLKIO_STAT_DEQUEUE -#endif -}; +/* CFQ specific, out here for blkcg->cfq_weight */ +#define CFQ_WEIGHT_MIN 10 +#define CFQ_WEIGHT_MAX 1000 +#define CFQ_WEIGHT_DEFAULT 500 -/* Per cpu stats */ -enum stat_type_cpu { - BLKIO_STAT_CPU_SECTORS, - /* Total bytes transferred */ - BLKIO_STAT_CPU_SERVICE_BYTES, - /* Total IOs serviced, post merge */ - BLKIO_STAT_CPU_SERVICED, - /* Number of IOs merged */ - BLKIO_STAT_CPU_MERGED, - BLKIO_STAT_CPU_NR -}; +#ifdef CONFIG_BLK_CGROUP -enum stat_sub_type { - BLKIO_STAT_READ = 0, - BLKIO_STAT_WRITE, - BLKIO_STAT_SYNC, - BLKIO_STAT_ASYNC, - BLKIO_STAT_TOTAL -}; +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, -/* blkg state flags */ -enum blkg_state_flags { - BLKG_waiting = 0, - BLKG_idling, - BLKG_empty, + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, }; -/* cgroup files owned by proportional weight policy */ -enum blkcg_file_name_prop { - BLKIO_PROP_weight = 1, - BLKIO_PROP_weight_device, - BLKIO_PROP_io_service_bytes, - BLKIO_PROP_io_serviced, - BLKIO_PROP_time, 
- BLKIO_PROP_sectors, - BLKIO_PROP_unaccounted_time, - BLKIO_PROP_io_service_time, - BLKIO_PROP_io_wait_time, - BLKIO_PROP_io_merged, - BLKIO_PROP_io_queued, - BLKIO_PROP_avg_queue_size, - BLKIO_PROP_group_wait_time, - BLKIO_PROP_idle_time, - BLKIO_PROP_empty_time, - BLKIO_PROP_dequeue, -}; +struct blkcg_gq; -/* cgroup files owned by throttle policy */ -enum blkcg_file_name_throtl { - BLKIO_THROTL_read_bps_device, - BLKIO_THROTL_write_bps_device, - BLKIO_THROTL_read_iops_device, - BLKIO_THROTL_write_iops_device, - BLKIO_THROTL_io_service_bytes, - BLKIO_THROTL_io_serviced, -}; +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; -struct blkio_cgroup { - struct cgroup_subsys_state css; - unsigned int weight; - spinlock_t lock; - struct hlist_head blkg_list; - struct list_head policy_list; /* list of blkio_policy_node */ -}; + struct radix_tree_root blkg_tree; + struct blkcg_gq *blkg_hint; + struct hlist_head blkg_list; + + /* for policies to test whether associated blkcg has changed */ + uint64_t id; -struct blkio_group_stats { - /* total disk time and nr sectors dispatched by this group */ - uint64_t time; - uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; -#ifdef CONFIG_DEBUG_BLK_CGROUP - /* Time not charged to this cgroup */ - uint64_t unaccounted_time; - - /* Sum of number of IOs queued across all samples */ - uint64_t avg_queue_size_sum; - /* Count of samples taken for average */ - uint64_t avg_queue_size_samples; - /* How many times this group has been removed from service tree */ - unsigned long dequeue; - - /* Total time spent waiting for it to be assigned a timeslice. */ - uint64_t group_wait_time; - uint64_t start_group_wait_time; - - /* Time spent idling for this blkio_group */ - uint64_t idle_time; - uint64_t start_idle_time; - /* - * Total time when we have requests queued and do not contain the - * current active queue. 
- */ - uint64_t empty_time; - uint64_t start_empty_time; - uint16_t flags; -#endif + /* TODO: per-policy storage in blkcg */ + unsigned int cfq_weight; /* belongs to cfq */ }; -/* Per cpu blkio group stats */ -struct blkio_group_stats_cpu { - uint64_t sectors; - uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; - struct u64_stats_sync syncp; +struct blkg_stat { + struct u64_stats_sync syncp; + uint64_t cnt; }; -struct blkio_group { - /* An rcu protected unique identifier for the group */ - void *key; - struct hlist_node blkcg_node; - unsigned short blkcg_id; - /* Store cgroup path */ - char path[128]; - /* The device MKDEV(major, minor), this group has been created for */ - dev_t dev; - /* policy which owns this blk group */ - enum blkio_policy_id plid; - - /* Need to serialize the stats in the case of reset/update */ - spinlock_t stats_lock; - struct blkio_group_stats stats; - /* Per cpu stats pointer */ - struct blkio_group_stats_cpu __percpu *stats_cpu; +struct blkg_rwstat { + struct u64_stats_sync syncp; + uint64_t cnt[BLKG_RWSTAT_NR]; }; -struct blkio_policy_node { - struct list_head node; - dev_t dev; - /* This node belongs to max bw policy or porportional weight policy */ - enum blkio_policy_id plid; - /* cgroup file to which this rule belongs to */ - int fileid; - - union { - unsigned int weight; - /* - * Rate read/write in terms of bytes per second - * Whether this rate represents read or write is determined - * by file type "fileid". - */ - u64 bps; - unsigned int iops; - } val; +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each has its private + * data on each blkg, the size of which is determined by + * blkcg_policy->pd_size. blkcg core allocates and frees such areas + * together with blkg and invokes pd_init/exit_fn() methods. 
+ * + * Such private data must embed struct blkg_policy_data (pd) at the + * beginning and pd_size can't be smaller than pd. + */ +struct blkg_policy_data { + /* the blkg this per-policy data belongs to */ + struct blkcg_gq *blkg; + + /* used during policy activation */ + struct list_head alloc_node; }; -extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, - dev_t dev); -extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, - dev_t dev); -extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, - dev_t dev); -extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, - dev_t dev); -extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, - dev_t dev); - -typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); - -typedef void (blkio_update_group_weight_fn) (void *key, - struct blkio_group *blkg, unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (void * key, - struct blkio_group *blkg, u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (void *key, - struct blkio_group *blkg, u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (void *key, - struct blkio_group *blkg, unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (void *key, - struct blkio_group *blkg, unsigned int write_iops); - -struct blkio_policy_ops { - blkio_unlink_group_fn *blkio_unlink_group_fn; - blkio_update_group_weight_fn *blkio_update_group_weight_fn; - blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; - blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; - blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; - blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + /* reference count 
*/ + int refcnt; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; + + struct rcu_head rcu_head; }; -struct blkio_policy_type { - struct list_head list; - struct blkio_policy_ops ops; - enum blkio_policy_id plid; +typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); + +struct blkcg_policy { + int plid; + /* policy specific private data size */ + size_t pd_size; + /* cgroup files for the policy */ + struct cftype *cftypes; + + /* operations */ + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; +extern struct blkcg blkcg_root; + +struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup); +struct blkcg *bio_blkcg(struct bio *bio); +struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q); +int blkcg_init_queue(struct request_queue *q); +void blkcg_drain_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + /* Blkio controller policy registration */ -extern void blkio_policy_register(struct blkio_policy_type *); -extern void blkio_policy_unregister(struct blkio_policy_type *); +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat 
*rwstat); +u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); + +struct blkg_conf_ctx { + struct gendisk *disk; + struct blkcg_gq *blkg; + u64 v; +}; + +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + const char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + + +/** + * blkg_to_pd - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +/** + * pd_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + int ret; + + rcu_read_lock(); + ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + rcu_read_unlock(); + if (ret) + strncpy(buf, "<unavailable>", buflen); + return ret; +} -static inline char *blkg_path(struct blkio_group *blkg) +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding queue_lock and an existing reference. 
+ */ +static inline void blkg_get(struct blkcg_gq *blkg) { - return blkg->path; + lockdep_assert_held(blkg->q->queue_lock); + WARN_ON_ONCE(!blkg->refcnt); + blkg->refcnt++; } -#else +void __blkg_release(struct blkcg_gq *blkg); -struct blkio_group { +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + * + * The caller should be holding queue_lock. + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + lockdep_assert_held(blkg->q->queue_lock); + WARN_ON_ONCE(blkg->refcnt <= 0); + if (!--blkg->refcnt) + __blkg_release(blkg); +} + +/** + * blkg_stat_add - add a value to a blkg_stat + * @stat: target blkg_stat + * @val: value to add + * + * Add @val to @stat. The caller is responsible for synchronizing calls to + * this function. + */ +static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) +{ + u64_stats_update_begin(&stat->syncp); + stat->cnt += val; + u64_stats_update_end(&stat->syncp); +} + +/** + * blkg_stat_read - read the current value of a blkg_stat + * @stat: blkg_stat to read + * + * Read the current value of @stat. This function can be called without + * synchronization and takes care of u64 atomicity. + */ +static inline uint64_t blkg_stat_read(struct blkg_stat *stat) +{ + unsigned int start; + uint64_t v; + + do { + start = u64_stats_fetch_begin(&stat->syncp); + v = stat->cnt; + } while (u64_stats_fetch_retry(&stat->syncp, start)); + + return v; +} + +/** + * blkg_stat_reset - reset a blkg_stat + * @stat: blkg_stat to reset + */ +static inline void blkg_stat_reset(struct blkg_stat *stat) +{ + stat->cnt = 0; +} + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @rw: mask of REQ_{WRITE|SYNC} + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. 
+ */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + int rw, uint64_t val) +{ + u64_stats_update_begin(&rwstat->syncp); + + if (rw & REQ_WRITE) + rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + else + rwstat->cnt[BLKG_RWSTAT_READ] += val; + if (rw & REQ_SYNC) + rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + else + rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + + u64_stats_update_end(&rwstat->syncp); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it as the return value. + * This function can be called without synchronization and takes care of + * u64 atomicity. + */ +static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) +{ + unsigned int start; + struct blkg_rwstat tmp; + + do { + start = u64_stats_fetch_begin(&rwstat->syncp); + tmp = *rwstat; + } while (u64_stats_fetch_retry(&rwstat->syncp, start)); + + return tmp; +} + +/** + * blkg_rwstat_sum - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. 
+ */ +static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); + + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); +} + +#else /* CONFIG_BLK_CGROUP */ + +struct cgroup; + +struct blkg_policy_data { }; -struct blkio_policy_type { +struct blkcg_gq { }; -static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } -static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } - -static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } - -#endif - -#define BLKIO_WEIGHT_MIN 10 -#define BLKIO_WEIGHT_MAX 1000 -#define BLKIO_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); -void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue); -void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); -void blkiocg_update_idle_time_stats(struct blkio_group *blkg); -void blkiocg_set_start_empty_time(struct blkio_group *blkg); - -#define BLKG_FLAG_FNS(name) \ -static inline void blkio_mark_blkg_##name( \ - struct blkio_group_stats *stats) \ -{ \ - stats->flags |= (1 << BLKG_##name); \ -} \ -static inline void blkio_clear_blkg_##name( \ - struct blkio_group_stats *stats) \ -{ \ - stats->flags &= ~(1 << BLKG_##name); \ -} \ -static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ -{ \ - return (stats->flags & (1 << BLKG_##name)) != 0; \ -} \ - -BLKG_FLAG_FNS(waiting) -BLKG_FLAG_FNS(idling) -BLKG_FLAG_FNS(empty) -#undef BLKG_FLAG_FNS -#else -static inline void blkiocg_update_avg_queue_size_stats( - struct blkio_group *blkg) {} -static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) {} 
-static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) -{} -static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} -static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} -#endif - -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) -extern struct blkio_cgroup blkio_root_cgroup; -extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); -extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); -extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid); -extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); -extern int blkiocg_del_blkio_group(struct blkio_group *blkg); -extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, - void *key); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, - unsigned long unaccounted_time); -void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, - bool direction, bool sync); -void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); -void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, - bool sync); -void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync); -void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync); -#else -struct cgroup; -static inline struct blkio_cgroup * -cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } -static inline struct blkio_cgroup * -task_blkio_cgroup(struct task_struct *tsk) { return NULL; } - -static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev, - enum blkio_policy_id plid) {} - -static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) 
{ return 0; } - -static inline int -blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } - -static inline struct blkio_group * -blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } -static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, - unsigned long unaccounted_time) -{} -static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) {} -static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, - uint64_t start_time, uint64_t io_start_time, bool direction, - bool sync) {} -static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) {} -static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -#endif -#endif /* _BLK_CGROUP_H */ +struct blkcg_policy { +}; + +static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } +static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_drain_queue(struct request_queue *q) { } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { 
return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +#endif /* CONFIG_BLK_CGROUP */ +#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 1f61b74..3c923a7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -29,11 +29,13 @@ #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> +#include <linux/ratelimit.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -280,7 +282,7 @@ EXPORT_SYMBOL(blk_stop_queue); * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevaotor_exit() - * and blk_throtl_exit() to be called with queue lock initialized. + * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) @@ -365,17 +367,23 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) spin_lock_irq(q->queue_lock); - elv_drain_elevator(q); - if (drain_all) - blk_throtl_drain(q); + /* + * The caller might be trying to drain @q before its + * elevator is initialized. + */ + if (q->elevator) + elv_drain_elevator(q); + + blkcg_drain_queue(q); /* * This function might be called on a queue which failed - * driver init after queue creation. Some drivers - * (e.g. fd) get unhappy in such cases. Kick queue iff - * dispatch queue has something on it. + * driver init after queue creation or is not yet fully + * active yet. Some drivers (e.g. fd and loop) get unhappy + * in such cases. Kick queue iff dispatch queue has + * something on it and @q has request_fn set. 
*/ - if (!list_empty(&q->queue_head)) + if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); drain |= q->rq.elvpriv; @@ -403,6 +411,49 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) } /** + * blk_queue_bypass_start - enter queue bypass mode + * @q: queue of interest + * + * In bypass mode, only the dispatch FIFO queue of @q is used. This + * function makes @q enter bypass mode and drains all requests which were + * throttled or issued before. On return, it's guaranteed that no request + * is being throttled or has ELVPRIV set and blk_queue_bypass() %true + * inside queue or RCU read lock. + */ +void blk_queue_bypass_start(struct request_queue *q) +{ + bool drain; + + spin_lock_irq(q->queue_lock); + drain = !q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + spin_unlock_irq(q->queue_lock); + + if (drain) { + blk_drain_queue(q, false); + /* ensure blk_queue_bypass() is %true inside RCU read lock */ + synchronize_rcu(); + } +} +EXPORT_SYMBOL_GPL(blk_queue_bypass_start); + +/** + * blk_queue_bypass_end - leave queue bypass mode + * @q: queue of interest + * + * Leave bypass mode and restore the normal queueing behavior. + */ +void blk_queue_bypass_end(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + if (!--q->bypass_depth) + queue_flag_clear(QUEUE_FLAG_BYPASS, q); + WARN_ON_ONCE(q->bypass_depth < 0); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(blk_queue_bypass_end); + +/** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown * @@ -418,6 +469,19 @@ void blk_cleanup_queue(struct request_queue *q) queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); spin_lock_irq(lock); + + /* + * Dead queue is permanently in bypass mode till released. Note + * that, unlike blk_queue_bypass_start(), we aren't performing + * synchronize_rcu() after entering bypass mode to avoid the delay + * as some drivers create and destroy a lot of queues while + * probing. 
This is still safe because blk_release_queue() will be + * called only after the queue refcnt drops to zero and nothing, + * RCU or not, would be traversing the queue by then. + */ + q->bypass_depth++; + queue_flag_set(QUEUE_FLAG_BYPASS, q); + queue_flag_set(QUEUE_FLAG_NOMERGES, q); queue_flag_set(QUEUE_FLAG_NOXMERGES, q); queue_flag_set(QUEUE_FLAG_DEAD, q); @@ -428,13 +492,8 @@ void blk_cleanup_queue(struct request_queue *q) spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); - /* - * Drain all requests queued before DEAD marking. The caller might - * be trying to tear down @q before its elevator is initialized, in - * which case we don't want to call into draining. - */ - if (q->elevator) - blk_drain_queue(q, true); + /* drain all requests queued before DEAD marking */ + blk_drain_queue(q, true); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); @@ -498,14 +557,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (err) goto fail_id; - if (blk_throtl_init(q)) - goto fail_id; - setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); +#ifdef CONFIG_BLK_CGROUP + INIT_LIST_HEAD(&q->blkg_list); +#endif INIT_LIST_HEAD(&q->flush_queue[0]); INIT_LIST_HEAD(&q->flush_queue[1]); INIT_LIST_HEAD(&q->flush_data_in_flight); @@ -522,6 +582,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) */ q->queue_lock = &q->__queue_lock; + /* + * A queue starts its life with bypass turned on to avoid + * unnecessary bypass on/off overhead and nasty surprises during + * init. The initial bypass will be finished at the end of + * blk_init_allocated_queue(). 
+ */ + q->bypass_depth = 1; + __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + + if (blkcg_init_queue(q)) + goto fail_id; + return q; fail_id: @@ -614,15 +686,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, q->sg_reserved_size = INT_MAX; - /* - * all done - */ - if (!elevator_init(q, NULL)) { - blk_queue_congestion_threshold(q); - return q; - } + /* init elevator */ + if (elevator_init(q, NULL)) + return NULL; - return NULL; + blk_queue_congestion_threshold(q); + + /* all done, end the initial bypass */ + blk_queue_bypass_end(q); + return q; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -648,33 +720,6 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) mempool_free(rq, q->rq.rq_pool); } -static struct request * -blk_alloc_request(struct request_queue *q, struct io_cq *icq, - unsigned int flags, gfp_t gfp_mask) -{ - struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); - - if (!rq) - return NULL; - - blk_rq_init(q, rq); - - rq->cmd_flags = flags | REQ_ALLOCED; - - if (flags & REQ_ELVPRIV) { - rq->elv.icq = icq; - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); - return NULL; - } - /* @rq->elv.icq holds on to io_context until @rq is freed */ - if (icq) - get_io_context(icq->ioc); - } - - return rq; -} - /* * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. @@ -763,6 +808,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio) } /** + * rq_ioc - determine io_context for request allocation + * @bio: request being allocated is for this bio (can be %NULL) + * + * Determine io_context to use for request allocation for @bio. May return + * %NULL if %current->io_context doesn't exist. 
+ */ +static struct io_context *rq_ioc(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio && bio->bi_ioc) + return bio->bi_ioc; +#endif + return current->io_context; +} + +/** * get_request - get a free request * @q: request_queue to allocate request from * @rw_flags: RW and SYNC flags @@ -779,7 +840,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio) static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) { - struct request *rq = NULL; + struct request *rq; struct request_list *rl = &q->rq; struct elevator_type *et; struct io_context *ioc; @@ -789,7 +850,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, int may_queue; retry: et = q->elevator->type; - ioc = current->io_context; + ioc = rq_ioc(bio); if (unlikely(blk_queue_dead(q))) return NULL; @@ -808,7 +869,7 @@ retry: */ if (!ioc && !retried) { spin_unlock_irq(q->queue_lock); - create_io_context(current, gfp_mask, q->node); + create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); retried = true; goto retry; @@ -831,7 +892,7 @@ retry: * process is not a "batcher", and not * exempted by the IO scheduler */ - goto out; + return NULL; } } } @@ -844,7 +905,7 @@ retry: * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - goto out; + return NULL; rl->count[is_sync]++; rl->starved[is_sync] = 0; @@ -859,8 +920,7 @@ retry: * Also, lookup icq while holding queue_lock. If it doesn't exist, * it will be created after releasing queue_lock. 
*/ - if (blk_rq_should_init_elevator(bio) && - !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { + if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { rw_flags |= REQ_ELVPRIV; rl->elvpriv++; if (et->icq_cache && ioc) @@ -871,41 +931,36 @@ retry: rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - /* create icq if missing */ - if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { - icq = ioc_create_icq(q, gfp_mask); - if (!icq) - goto fail_icq; - } - - rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); + /* allocate and init request */ + rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + if (!rq) + goto fail_alloc; -fail_icq: - if (unlikely(!rq)) { - /* - * Allocation failed presumably due to memory. Undo anything - * we might have messed up. - * - * Allocating task should really be put onto the front of the - * wait queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(q, rw_flags); + blk_rq_init(q, rq); + rq->cmd_flags = rw_flags | REQ_ALLOCED; + + /* init elvpriv */ + if (rw_flags & REQ_ELVPRIV) { + if (unlikely(et->icq_cache && !icq)) { + create_io_context(gfp_mask, q->node); + ioc = rq_ioc(bio); + if (!ioc) + goto fail_elvpriv; + + icq = ioc_create_icq(ioc, q, gfp_mask); + if (!icq) + goto fail_elvpriv; + } - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved - * so that freeing of a request in the other direction will - * notice us. another possible fix would be to split the - * rq mempool into READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; + rq->elv.icq = icq; + if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) + goto fail_elvpriv; - goto out; + /* @rq->elv.icq holds io_context until @rq is freed */ + if (icq) + get_io_context(icq->ioc); } - +out: /* * ioc may be NULL here, and ioc_batching will be false. 
That's * OK, if the queue is under the request limit then requests need @@ -916,8 +971,48 @@ rq_starved: ioc->nr_batch_requests--; trace_block_getrq(q, bio, rw_flags & 1); -out: return rq; + +fail_elvpriv: + /* + * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed + * and may fail indefinitely under memory pressure and thus + * shouldn't stall IO. Treat this request as !elvpriv. This will + * disturb iosched and blkcg but weird is bettern than dead. + */ + printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", + dev_name(q->backing_dev_info.dev)); + + rq->cmd_flags &= ~REQ_ELVPRIV; + rq->elv.icq = NULL; + + spin_lock_irq(q->queue_lock); + rl->elvpriv--; + spin_unlock_irq(q->queue_lock); + goto out; + +fail_alloc: + /* + * Allocation failed presumably due to memory. Undo anything we + * might have messed up. + * + * Allocating task should really be put onto the front of the wait + * queue, but this is pretty rare. + */ + spin_lock_irq(q->queue_lock); + freed_request(q, rw_flags); + + /* + * in the very unlikely event that allocation failed and no + * requests for this direction was pending, mark us starved so that + * freeing of a request in the other direction will notice + * us. another possible fix would be to split the rq mempool into + * READ and WRITE + */ +rq_starved: + if (unlikely(rl->count[is_sync] == 0)) + rl->starved[is_sync] = 1; + return NULL; } /** @@ -961,7 +1056,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, * up to a big batch of them for a small period time. 
* See ioc_batching, ioc_set_batching */ - create_io_context(current, GFP_NOIO, q->node); + create_io_context(GFP_NOIO, q->node); ioc_set_batching(q, current->io_context); spin_lock_irq(q->queue_lock); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fb95dd2..1e2d53b 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc) } EXPORT_SYMBOL(put_io_context); -/* Called by the exiting task */ -void exit_io_context(struct task_struct *task) +/** + * put_io_context_active - put active reference on ioc + * @ioc: ioc of interest + * + * Undo get_io_context_active(). If active reference reaches zero after + * put, @ioc can never issue further IOs and ioscheds are notified. + */ +void put_io_context_active(struct io_context *ioc) { - struct io_context *ioc; - struct io_cq *icq; struct hlist_node *n; unsigned long flags; + struct io_cq *icq; - task_lock(task); - ioc = task->io_context; - task->io_context = NULL; - task_unlock(task); - - if (!atomic_dec_and_test(&ioc->nr_tasks)) { + if (!atomic_dec_and_test(&ioc->active_ref)) { put_io_context(ioc); return; } @@ -197,6 +197,20 @@ retry: put_io_context(ioc); } +/* Called by the exiting task */ +void exit_io_context(struct task_struct *task) +{ + struct io_context *ioc; + + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + atomic_dec(&ioc->nr_tasks); + put_io_context_active(ioc); +} + /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared @@ -218,19 +232,18 @@ void ioc_clear_queue(struct request_queue *q) } } -void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, - int node) +int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) { struct io_context *ioc; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return; + return -ENOMEM; /* initialize */ atomic_long_set(&ioc->refcount, 
1); - atomic_set(&ioc->nr_tasks, 1); + atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->icq_list); @@ -250,6 +263,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, else kmem_cache_free(iocontext_cachep, ioc); task_unlock(task); + + return 0; } /** @@ -281,7 +296,7 @@ struct io_context *get_task_io_context(struct task_struct *task, return ioc; } task_unlock(task); - } while (create_io_context(task, gfp_flags, node)); + } while (!create_task_io_context(task, gfp_flags, node)); return NULL; } @@ -325,26 +340,23 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq + * @ioc: io_context of interest * @q: request_queue of interest * @gfp_mask: allocation mask * - * Make sure io_cq linking %current->io_context and @q exists. If either - * io_context and/or icq don't exist, they will be created using @gfp_mask. + * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they + * will be created using @gfp_mask. * * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. 
*/ -struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; - struct io_context *ioc; struct io_cq *icq; /* allocate stuff */ - ioc = create_io_context(current, gfp_mask, q->node); - if (!ioc) - return NULL; - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, q->node); if (!icq) @@ -382,74 +394,6 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) return icq; } -void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags) -{ - struct io_cq *icq; - struct hlist_node *n; - - hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) - icq->flags |= flags; -} - -/** - * ioc_ioprio_changed - notify ioprio change - * @ioc: io_context of interest - * @ioprio: new ioprio - * - * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all - * icq's. iosched is responsible for checking the bit and applying it on - * request issue path. - */ -void ioc_ioprio_changed(struct io_context *ioc, int ioprio) -{ - unsigned long flags; - - spin_lock_irqsave(&ioc->lock, flags); - ioc->ioprio = ioprio; - ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED); - spin_unlock_irqrestore(&ioc->lock, flags); -} - -/** - * ioc_cgroup_changed - notify cgroup change - * @ioc: io_context of interest - * - * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's. - * iosched is responsible for checking the bit and applying it on request - * issue path. - */ -void ioc_cgroup_changed(struct io_context *ioc) -{ - unsigned long flags; - - spin_lock_irqsave(&ioc->lock, flags); - ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED); - spin_unlock_irqrestore(&ioc->lock, flags); -} -EXPORT_SYMBOL(ioc_cgroup_changed); - -/** - * icq_get_changed - fetch and clear icq changed mask - * @icq: icq of interest - * - * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases - * @icq->ioc->lock. 
- */ -unsigned icq_get_changed(struct io_cq *icq) -{ - unsigned int changed = 0; - unsigned long flags; - - if (unlikely(icq->flags & ICQ_CHANGED_MASK)) { - spin_lock_irqsave(&icq->ioc->lock, flags); - changed = icq->flags & ICQ_CHANGED_MASK; - icq->flags &= ~ICQ_CHANGED_MASK; - spin_unlock_irqrestore(&icq->ioc->lock, flags); - } - return changed; -} -EXPORT_SYMBOL(icq_get_changed); - static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cf15001..aa41b47 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -9,6 +9,7 @@ #include <linux/blktrace_api.h> #include "blk.h" +#include "blk-cgroup.h" struct queue_sysfs_entry { struct attribute attr; @@ -479,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); + blkcg_exit_queue(q); + if (q->elevator) { spin_lock_irq(q->queue_lock); ioc_clear_queue(q); @@ -486,15 +489,12 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } - blk_throtl_exit(q); - if (rl->rq_pool) mempool_destroy(rl->rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); - blk_throtl_release(q); blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f2ddb94..5b06595 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -21,6 +21,8 @@ static int throtl_quantum = 32; /* Throttling is performed over 100ms slice and after that slice is renewed */ static unsigned long throtl_slice = HZ/10; /* 100 ms */ +static struct blkcg_policy blkcg_policy_throtl; + /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; static void throtl_schedule_delayed_work(struct throtl_data *td, @@ -38,9 +40,17 @@ struct throtl_rb_root { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) +/* Per-cpu group stats */ +struct tg_stats_cpu { + /* total bytes transferred */ + struct blkg_rwstat 
service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; +}; + struct throtl_grp { - /* List of throtl groups on the request queue*/ - struct hlist_node tg_node; + /* must be the first member */ + struct blkg_policy_data pd; /* active throtl group service_tree member */ struct rb_node rb_node; @@ -52,8 +62,6 @@ struct throtl_grp { */ unsigned long disptime; - struct blkio_group blkg; - atomic_t ref; unsigned int flags; /* Two lists for READ and WRITE */ @@ -80,18 +88,18 @@ struct throtl_grp { /* Some throttle limits got updated for the group */ int limits_changed; - struct rcu_head rcu_head; + /* Per cpu stats pointer */ + struct tg_stats_cpu __percpu *stats_cpu; + + /* List of tgs waiting for per cpu stats memory to be allocated */ + struct list_head stats_alloc_node; }; struct throtl_data { - /* List of throtl groups */ - struct hlist_head tg_list; - /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; - struct throtl_grp *root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -108,6 +116,33 @@ struct throtl_data int limits_changed; }; +/* list and work item to allocate percpu group stats */ +static DEFINE_SPINLOCK(tg_stats_alloc_lock); +static LIST_HEAD(tg_stats_alloc_list); + +static void tg_stats_alloc_fn(struct work_struct *); +static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); + +static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) +{ + return pd ? 
container_of(pd, struct throtl_grp, pd) : NULL; +} + +static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) +{ + return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); +} + +static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) +{ + return pd_to_blkg(&tg->pd); +} + +static inline struct throtl_grp *td_root_tg(struct throtl_data *td) +{ + return blkg_to_tg(td->queue->root_blkg); +} + enum tg_state_flags { THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ }; @@ -128,244 +163,150 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \ THROTL_TG_FNS(on_rr); -#define throtl_log_tg(td, tg, fmt, args...) \ - blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ - blkg_path(&(tg)->blkg), ##args); \ +#define throtl_log_tg(td, tg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ +} while (0) #define throtl_log(td, fmt, args...) \ blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) -static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) -{ - if (blkg) - return container_of(blkg, struct throtl_grp, blkg); - - return NULL; -} - static inline unsigned int total_nr_queued(struct throtl_data *td) { return td->nr_queued[0] + td->nr_queued[1]; } -static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) -{ - atomic_inc(&tg->ref); - return tg; -} - -static void throtl_free_tg(struct rcu_head *head) +/* + * Worker for allocating per cpu stat for tgs. This is scheduled on the + * system_nrt_wq once there are some groups on the alloc_list waiting for + * allocation. 
+ */ +static void tg_stats_alloc_fn(struct work_struct *work) { - struct throtl_grp *tg; + static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ + struct delayed_work *dwork = to_delayed_work(work); + bool empty = false; + +alloc_stats: + if (!stats_cpu) { + stats_cpu = alloc_percpu(struct tg_stats_cpu); + if (!stats_cpu) { + /* allocation failed, try again after some time */ + queue_delayed_work(system_nrt_wq, dwork, + msecs_to_jiffies(10)); + return; + } + } - tg = container_of(head, struct throtl_grp, rcu_head); - free_percpu(tg->blkg.stats_cpu); - kfree(tg); -} + spin_lock_irq(&tg_stats_alloc_lock); -static void throtl_put_tg(struct throtl_grp *tg) -{ - BUG_ON(atomic_read(&tg->ref) <= 0); - if (!atomic_dec_and_test(&tg->ref)) - return; + if (!list_empty(&tg_stats_alloc_list)) { + struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, + struct throtl_grp, + stats_alloc_node); + swap(tg->stats_cpu, stats_cpu); + list_del_init(&tg->stats_alloc_node); + } - /* - * A group is freed in rcu manner. But having an rcu lock does not - * mean that one can access all the fields of blkg and assume these - * are valid. For example, don't try to follow throtl_data and - * request queue links. 
- * - * Having a reference to blkg under an rcu allows acess to only - * values local to groups like group stats and group rate limits - */ - call_rcu(&tg->rcu_head, throtl_free_tg); + empty = list_empty(&tg_stats_alloc_list); + spin_unlock_irq(&tg_stats_alloc_lock); + if (!empty) + goto alloc_stats; } -static void throtl_init_group(struct throtl_grp *tg) +static void throtl_pd_init(struct blkcg_gq *blkg) { - INIT_HLIST_NODE(&tg->tg_node); + struct throtl_grp *tg = blkg_to_tg(blkg); + unsigned long flags; + RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); tg->limits_changed = false; - /* Practically unlimited BW */ - tg->bps[0] = tg->bps[1] = -1; - tg->iops[0] = tg->iops[1] = -1; + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * request queue which will be dropped by either request queue - * exit or cgroup deletion path depending on who is exiting first. + * Ugh... We need to perform per-cpu allocation for tg->stats_cpu + * but percpu allocator can't be called from IO path. Queue tg on + * tg_stats_alloc_list and allocate from work item. 
*/ - atomic_set(&tg->ref, 1); + spin_lock_irqsave(&tg_stats_alloc_lock, flags); + list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); + queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0); + spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } -/* Should be called with rcu read lock held (needed for blkcg) */ -static void -throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) +static void throtl_pd_exit(struct blkcg_gq *blkg) { - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; -} - -static void -__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) -{ - struct backing_dev_info *bdi = &td->queue->backing_dev_info; - unsigned int major, minor; - - if (!tg || tg->blkg.dev) - return; - - /* - * Fill in device details for a group which might not have been - * filled at group creation time as queue was being instantiated - * and driver had not attached a device yet - */ - if (bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - tg->blkg.dev = MKDEV(major, minor); - } -} - -/* - * Should be called with without queue lock held. Here queue lock will be - * taken rarely. 
It will be taken only once during life time of a group - * if need be - */ -static void -throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) -{ - if (!tg || tg->blkg.dev) - return; - - spin_lock_irq(td->queue->queue_lock); - __throtl_tg_fill_dev_details(td, tg); - spin_unlock_irq(td->queue->queue_lock); -} - -static void throtl_init_add_tg_lists(struct throtl_data *td, - struct throtl_grp *tg, struct blkio_cgroup *blkcg) -{ - __throtl_tg_fill_dev_details(td, tg); - - /* Add group onto cgroup list */ - blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - tg->blkg.dev, BLKIO_POLICY_THROTL); + struct throtl_grp *tg = blkg_to_tg(blkg); + unsigned long flags; - tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); - tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); - tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); - tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + spin_lock_irqsave(&tg_stats_alloc_lock, flags); + list_del_init(&tg->stats_alloc_node); + spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - throtl_add_group_to_td_list(td, tg); + free_percpu(tg->stats_cpu); } -/* Should be called without queue lock and outside of rcu period */ -static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) +static void throtl_pd_reset_stats(struct blkcg_gq *blkg) { - struct throtl_grp *tg = NULL; - int ret; + struct throtl_grp *tg = blkg_to_tg(blkg); + int cpu; - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); - if (!tg) - return NULL; + if (tg->stats_cpu == NULL) + return; - ret = blkio_alloc_blkg_stats(&tg->blkg); + for_each_possible_cpu(cpu) { + struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - if (ret) { - kfree(tg); - return NULL; + blkg_rwstat_reset(&sc->service_bytes); + blkg_rwstat_reset(&sc->serviced); } - - throtl_init_group(tg); - return tg; } -static struct -throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) +static struct throtl_grp 
*throtl_lookup_tg(struct throtl_data *td, + struct blkcg *blkcg) { - struct throtl_grp *tg = NULL; - void *key = td; - /* - * This is the common case when there are no blkio cgroups. - * Avoid lookup in this case - */ - if (blkcg == &blkio_root_cgroup) - tg = td->root_tg; - else - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + * This is the common case when there are no blkcgs. Avoid lookup + * in this case + */ + if (blkcg == &blkcg_root) + return td_root_tg(td); - __throtl_tg_fill_dev_details(td, tg); - return tg; + return blkg_to_tg(blkg_lookup(blkcg, td->queue)); } -static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, + struct blkcg *blkcg) { - struct throtl_grp *tg = NULL, *__tg = NULL; - struct blkio_cgroup *blkcg; struct request_queue *q = td->queue; - - /* no throttling for dead queue */ - if (unlikely(blk_queue_dead(q))) - return NULL; - - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - tg = throtl_find_tg(td, blkcg); - if (tg) { - rcu_read_unlock(); - return tg; - } - - /* - * Need to allocate a group. Allocation of group also needs allocation - * of per cpu stats which in-turn takes a mutex() and can block. Hence - * we need to drop rcu lock and queue_lock before we call alloc. - */ - rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); - - tg = throtl_alloc_tg(td); - - /* Group allocated and queue is still alive. take the lock */ - spin_lock_irq(q->queue_lock); - - /* Make sure @q is still alive */ - if (unlikely(blk_queue_dead(q))) { - kfree(tg); - return NULL; - } - - /* - * Initialize the new group. After sleeping, read the blkcg again. - */ - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); + struct throtl_grp *tg = NULL; /* - * If some other thread already allocated the group while we were - * not holding queue lock, free up the group + * This is the common case when there are no blkcgs. 
Avoid lookup + * in this case */ - __tg = throtl_find_tg(td, blkcg); - - if (__tg) { - kfree(tg); - rcu_read_unlock(); - return __tg; - } - - /* Group allocation failed. Account the IO to root group */ - if (!tg) { - tg = td->root_tg; - return tg; + if (blkcg == &blkcg_root) { + tg = td_root_tg(td); + } else { + struct blkcg_gq *blkg; + + blkg = blkg_lookup_create(blkcg, q); + + /* if %NULL and @q is alive, fall back to root_tg */ + if (!IS_ERR(blkg)) + tg = blkg_to_tg(blkg); + else if (!blk_queue_dead(q)) + tg = td_root_tg(td); } - throtl_init_add_tg_lists(td, tg, blkcg); - rcu_read_unlock(); return tg; } @@ -734,16 +675,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, return 0; } +static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, + int rw) +{ + struct throtl_grp *tg = blkg_to_tg(blkg); + struct tg_stats_cpu *stats_cpu; + unsigned long flags; + + /* If per cpu stats are not allocated yet, don't do any accounting. */ + if (tg->stats_cpu == NULL) + return; + + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. 
+ */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(tg->stats_cpu); + + blkg_rwstat_add(&stats_cpu->serviced, rw, 1); + blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); + + local_irq_restore(flags); +} + static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - bool sync = rw_is_sync(bio->bi_rw); /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -753,7 +719,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, bio_list_add(&tg->bio_lists[rw], bio); /* Take a bio reference on tg */ - throtl_ref_get_tg(tg); + blkg_get(tg_to_blkg(tg)); tg->nr_queued[rw]++; td->nr_queued[rw]++; throtl_enqueue_tg(td, tg); @@ -786,8 +752,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, bio = bio_list_pop(&tg->bio_lists[rw]); tg->nr_queued[rw]--; - /* Drop bio reference on tg */ - throtl_put_tg(tg); + /* Drop bio reference on blkg */ + blkg_put(tg_to_blkg(tg)); BUG_ON(td->nr_queued[rw] <= 0); td->nr_queued[rw]--; @@ -865,8 +831,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) static void throtl_process_limit_change(struct throtl_data *td) { - struct throtl_grp *tg; - struct hlist_node *pos, *n; + struct request_queue *q = td->queue; + struct blkcg_gq *blkg, *n; if (!td->limits_changed) return; @@ -875,7 +841,9 @@ static void throtl_process_limit_change(struct throtl_data *td) throtl_log(td, "limits changed"); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct throtl_grp *tg = blkg_to_tg(blkg); + if (!tg->limits_changed) continue; @@ -973,120 +941,159 @@ throtl_schedule_delayed_work(struct 
throtl_data *td, unsigned long delay) } } -static void -throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - /* Something wrong if we are trying to remove same group twice */ - BUG_ON(hlist_unhashed(&tg->tg_node)); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkg_rwstat rwstat = { }, tmp; + int i, cpu; - hlist_del_init(&tg->tg_node); + for_each_possible_cpu(cpu) { + struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. - */ - throtl_put_tg(tg); - td->nr_undestroyed_grps--; + tmp = blkg_rwstat_read((void *)sc + off); + for (i = 0; i < BLKG_RWSTAT_NR; i++) + rwstat.cnt[i] += tmp.cnt[i]; + } + + return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static void throtl_release_tgs(struct throtl_data *td) +static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) { - struct hlist_node *pos, *n; - struct throtl_grp *tg; + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - /* - * If cgroup removal path got to blk_group first and removed - * it from cgroup list, then it will take care of destroying - * cfqg also. - */ - if (!blkiocg_del_blkio_group(&tg->blkg)) - throtl_destroy_tg(td, tg); - } + blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, + cft->private, true); + return 0; } -/* - * Blk cgroup controller notification saying that blkio_group object is being - * delinked as associated cgroup object is going away. That also means that - * no new IO will come in this group. So get rid of this group as soon as - * any pending IO in the group is finished. - * - * This function is called under rcu_read_lock(). key is the rcu protected - * pointer. 
That means "key" is a valid throtl_data pointer as long as we are - * rcu read lock. - * - * "key" was fetched from blkio_group under blkio_cgroup->lock. That means - * it should not be NULL as even if queue was going away, cgroup deltion - * path got to it first. - */ -void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, + int off) { - unsigned long flags; - struct throtl_data *td = key; + struct throtl_grp *tg = pd_to_tg(pd); + u64 v = *(u64 *)((void *)tg + off); - spin_lock_irqsave(td->queue->queue_lock, flags); - throtl_destroy_tg(td, tg_of_blkg(blkg)); - spin_unlock_irqrestore(td->queue->queue_lock, flags); + if (v == -1) + return 0; + return __blkg_prfill_u64(sf, pd, v); } -static void throtl_update_blkio_group_common(struct throtl_data *td, - struct throtl_grp *tg) +static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, + int off) { - xchg(&tg->limits_changed, true); - xchg(&td->limits_changed, true); - /* Schedule a work now to process the limit change */ - throtl_schedule_delayed_work(td, 0); + struct throtl_grp *tg = pd_to_tg(pd); + unsigned int v = *(unsigned int *)((void *)tg + off); + + if (v == -1) + return 0; + return __blkg_prfill_u64(sf, pd, v); } -/* - * For all update functions, key should be a valid pointer because these - * update functions are called under blkcg_lock, that means, blkg is - * valid and in turn key is valid. queue exit path can not race because - * of blkcg_lock - * - * Can not take queue lock in update functions as queue lock under blkcg_lock - * is not allowed. Under other paths we take blkcg_lock under queue_lock. 
- */ -static void throtl_update_blkio_group_read_bps(void *key, - struct blkio_group *blkg, u64 read_bps) +static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); - - tg->bps[READ] = read_bps; - throtl_update_blkio_group_common(td, tg); + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, + &blkcg_policy_throtl, cft->private, false); + return 0; } -static void throtl_update_blkio_group_write_bps(void *key, - struct blkio_group *blkg, u64 write_bps) +static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); - - tg->bps[WRITE] = write_bps; - throtl_update_blkio_group_common(td, tg); + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, + &blkcg_policy_throtl, cft->private, false); + return 0; } -static void throtl_update_blkio_group_read_iops(void *key, - struct blkio_group *blkg, unsigned int read_iops) +static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, + bool is_u64) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + struct throtl_data *td; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + td = ctx.blkg->q->td; + + if (!ctx.v) + ctx.v = -1; + + if (is_u64) + *(u64 *)((void *)tg + cft->private) = ctx.v; + else + *(unsigned int *)((void *)tg + cft->private) = ctx.v; + + /* XXX: we don't need the following deferred processing */ + xchg(&tg->limits_changed, true); + xchg(&td->limits_changed, true); + throtl_schedule_delayed_work(td, 0); - tg->iops[READ] = read_iops; - throtl_update_blkio_group_common(td, tg); + blkg_conf_finish(&ctx); + return 0; } -static void 
throtl_update_blkio_group_write_iops(void *key, - struct blkio_group *blkg, unsigned int write_iops) +static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, + const char *buf) { - struct throtl_data *td = key; - struct throtl_grp *tg = tg_of_blkg(blkg); + return tg_set_conf(cgrp, cft, buf, true); +} - tg->iops[WRITE] = write_iops; - throtl_update_blkio_group_common(td, tg); +static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return tg_set_conf(cgrp, cft, buf, false); } +static struct cftype throtl_files[] = { + { + .name = "throttle.read_bps_device", + .private = offsetof(struct throtl_grp, bps[READ]), + .read_seq_string = tg_print_conf_u64, + .write_string = tg_set_conf_u64, + .max_write_len = 256, + }, + { + .name = "throttle.write_bps_device", + .private = offsetof(struct throtl_grp, bps[WRITE]), + .read_seq_string = tg_print_conf_u64, + .write_string = tg_set_conf_u64, + .max_write_len = 256, + }, + { + .name = "throttle.read_iops_device", + .private = offsetof(struct throtl_grp, iops[READ]), + .read_seq_string = tg_print_conf_uint, + .write_string = tg_set_conf_uint, + .max_write_len = 256, + }, + { + .name = "throttle.write_iops_device", + .private = offsetof(struct throtl_grp, iops[WRITE]), + .read_seq_string = tg_print_conf_uint, + .write_string = tg_set_conf_uint, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = offsetof(struct tg_stats_cpu, service_bytes), + .read_seq_string = tg_print_cpu_rwstat, + }, + { + .name = "throttle.io_serviced", + .private = offsetof(struct tg_stats_cpu, serviced), + .read_seq_string = tg_print_cpu_rwstat, + }, + { } /* terminate */ +}; + static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1094,19 +1101,13 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_delayed_work_sync(&td->throtl_work); } -static struct blkio_policy_type blkio_policy_throtl = { - .ops = { - 
.blkio_unlink_group_fn = throtl_unlink_blkio_group, - .blkio_update_group_read_bps_fn = - throtl_update_blkio_group_read_bps, - .blkio_update_group_write_bps_fn = - throtl_update_blkio_group_write_bps, - .blkio_update_group_read_iops_fn = - throtl_update_blkio_group_read_iops, - .blkio_update_group_write_iops_fn = - throtl_update_blkio_group_write_iops, - }, - .plid = BLKIO_POLICY_THROTL, +static struct blkcg_policy blkcg_policy_throtl = { + .pd_size = sizeof(struct throtl_grp), + .cftypes = throtl_files, + + .pd_init_fn = throtl_pd_init, + .pd_exit_fn = throtl_pd_exit, + .pd_reset_stats_fn = throtl_pd_reset_stats, }; bool blk_throtl_bio(struct request_queue *q, struct bio *bio) @@ -1114,7 +1115,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) struct throtl_data *td = q->td; struct throtl_grp *tg; bool rw = bio_data_dir(bio), update_disptime = true; - struct blkio_cgroup *blkcg; + struct blkcg *blkcg; bool throttled = false; if (bio->bi_rw & REQ_THROTTLED) { @@ -1122,33 +1123,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) goto out; } + /* bio_associate_current() needs ioc, try creating */ + create_io_context(GFP_ATOMIC, q->node); + /* * A throtl_grp pointer retrieved under rcu can be used to access * basic fields like stats and io rates. If a group has no rules, * just update the dispatch stats in lockless manner and return. 
*/ - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - tg = throtl_find_tg(td, blkcg); + blkcg = bio_blkcg(bio); + tg = throtl_lookup_tg(td, blkcg); if (tg) { - throtl_tg_fill_dev_details(td, tg); - if (tg_no_rule_group(tg, rw)) { - blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, - rw, rw_is_sync(bio->bi_rw)); - rcu_read_unlock(); - goto out; + throtl_update_dispatch_stats(tg_to_blkg(tg), + bio->bi_size, bio->bi_rw); + goto out_unlock_rcu; } } - rcu_read_unlock(); /* * Either group has not been allocated yet or it is not an unlimited * IO group */ spin_lock_irq(q->queue_lock); - tg = throtl_get_tg(td); + tg = throtl_lookup_create_tg(td, blkcg); if (unlikely(!tg)) goto out_unlock; @@ -1189,6 +1188,7 @@ queue_bio: tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); + bio_associate_current(bio); throtl_add_bio_tg(q->td, tg, bio); throttled = true; @@ -1199,6 +1199,8 @@ queue_bio: out_unlock: spin_unlock_irq(q->queue_lock); +out_unlock_rcu: + rcu_read_unlock(); out: return throttled; } @@ -1241,79 +1243,31 @@ void blk_throtl_drain(struct request_queue *q) int blk_throtl_init(struct request_queue *q) { struct throtl_data *td; - struct throtl_grp *tg; + int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; td->limits_changed = false; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); - /* alloc and Init root group. 
*/ + q->td = td; td->queue = q; - tg = throtl_alloc_tg(td); - if (!tg) { + /* activate policy */ + ret = blkcg_activate_policy(q, &blkcg_policy_throtl); + if (ret) kfree(td); - return -ENOMEM; - } - - td->root_tg = tg; - - rcu_read_lock(); - throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); - rcu_read_unlock(); - - /* Attach throtl data to request queue */ - q->td = td; - return 0; + return ret; } void blk_throtl_exit(struct request_queue *q) { - struct throtl_data *td = q->td; - bool wait = false; - - BUG_ON(!td); - - throtl_shutdown_wq(q); - - spin_lock_irq(q->queue_lock); - throtl_release_tgs(td); - - /* If there are other groups */ - if (td->nr_undestroyed_grps > 0) - wait = true; - - spin_unlock_irq(q->queue_lock); - - /* - * Wait for tg->blkg->key accessors to exit their grace periods. - * Do this wait only if there are other undestroyed groups out - * there (other than root group). This can happen if cgroup deletion - * path claimed the responsibility of cleaning up a group before - * queue cleanup code get to the group. - * - * Do not call synchronize_rcu() unconditionally as there are drivers - * which create/delete request queue hundreds of times during scan/boot - * and synchronize_rcu() can take significant time and slow down boot. - */ - if (wait) - synchronize_rcu(); - - /* - * Just being safe to make sure after previous flush if some body did - * update limits through cgroup and another work got queued, cancel - * it. 
- */ + BUG_ON(!q->td); throtl_shutdown_wq(q); -} - -void blk_throtl_release(struct request_queue *q) -{ + blkcg_deactivate_policy(q, &blkcg_policy_throtl); kfree(q->td); } @@ -1323,8 +1277,7 @@ static int __init throtl_init(void) if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); - blkio_policy_register(&blkio_policy_throtl); - return 0; + return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init); diff --git a/block/blk.h b/block/blk.h index d45be87..85f6ae4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); -void blk_drain_queue(struct request_queue *q, bool drain_all); +void blk_queue_bypass_start(struct request_queue *q); +void blk_queue_bypass_end(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, @@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - /* * Return the threshold (number of used requests) at which the queue is @@ -186,32 +184,30 @@ static inline int blk_do_io_stat(struct request *rq) */ void get_io_context(struct io_context *ioc); struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask); void ioc_clear_queue(struct request_queue *q); -void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, - int node); +int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /** * create_io_context - try to create task->io_context - * @task: target 
task * @gfp_mask: allocation mask * @node: allocation node * - * If @task->io_context is %NULL, allocate a new io_context and install it. - * Returns the current @task->io_context which may be %NULL if allocation - * failed. + * If %current->io_context is %NULL, allocate a new io_context and install + * it. Returns the current %current->io_context which may be %NULL if + * allocation failed. * * Note that this function can't be called with IRQ disabled because - * task_lock which protects @task->io_context is IRQ-unsafe. + * task_lock which protects %current->io_context is IRQ-unsafe. */ -static inline struct io_context *create_io_context(struct task_struct *task, - gfp_t gfp_mask, int node) +static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) { WARN_ON_ONCE(irqs_disabled()); - if (unlikely(!task->io_context)) - create_io_context_slowpath(task, gfp_mask, node); - return task->io_context; + if (unlikely(!current->io_context)) + create_task_io_context(current, gfp_mask, node); + return current->io_context; } /* @@ -222,7 +218,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); -extern void blk_throtl_release(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) { @@ -231,7 +226,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } -static inline void blk_throtl_release(struct request_queue *q) { } #endif /* CONFIG_BLK_DEV_THROTTLING */ #endif /* BLK_INTERNAL_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3c38536..673c977 100644 --- 
a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -15,7 +15,9 @@ #include <linux/ioprio.h> #include <linux/blktrace_api.h> #include "blk.h" -#include "cfq.h" +#include "blk-cgroup.h" + +static struct blkcg_policy blkcg_policy_cfq __maybe_unused; /* * tunables @@ -171,8 +173,53 @@ enum wl_type_t { SYNC_WORKLOAD = 2 }; +struct cfqg_stats { +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_stat sectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* time not charged to this cgroup */ + struct blkg_stat unaccounted_time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. 
*/ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +#endif /* CONFIG_DEBUG_BLK_CGROUP */ +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + /* group service_tree member */ struct rb_node rb_node; @@ -180,7 +227,7 @@ struct cfq_group { u64 vdisktime; unsigned int weight; unsigned int new_weight; - bool needs_update; + unsigned int dev_weight; /* number of cfqq currently on this group */ int nr_cfqq; @@ -206,20 +253,21 @@ struct cfq_group { unsigned long saved_workload_slice; enum wl_type_t saved_workload; enum wl_prio_t saved_serving_prio; - struct blkio_group blkg; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - struct hlist_node cfqd_node; - int ref; -#endif + /* number of requests that are on the dispatch list or inside driver */ int dispatched; struct cfq_ttime ttime; + struct cfqg_stats stats; }; struct cfq_io_cq { struct io_cq icq; /* must be the first member */ struct cfq_queue *cfqq[2]; struct cfq_ttime ttime; + int ioprio; /* the current ioprio */ +#ifdef CONFIG_CFQ_GROUP_IOSCHED + uint64_t blkcg_id; /* the current blkcg ID */ +#endif }; /* @@ -229,7 +277,7 @@ struct cfq_data { struct request_queue *queue; /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; - struct cfq_group root_group; + struct cfq_group *root_group; /* * The priority currently being served @@ -303,12 +351,6 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_delayed_sync; - - /* List of cfq groups being managed on this device*/ - struct hlist_head cfqg_list; - - /* Number of groups which are on 
blkcg->blkg_list */ - unsigned int nr_blkcg_linked_grps; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -371,21 +413,284 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct cfq_group, pd) : NULL; +} + +static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) +{ + return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + +#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) + +/* cfqg stats flags */ +enum cfqg_stats_flags { + CFQG_stats_waiting = 0, + CFQG_stats_idling, + CFQG_stats_empty, +}; + +#define CFQG_FLAG_FNS(name) \ +static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ +{ \ + stats->flags |= (1 << CFQG_stats_##name); \ +} \ +static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << CFQG_stats_##name); \ +} \ +static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ +} \ + +CFQG_FLAG_FNS(waiting) +CFQG_FLAG_FNS(idling) +CFQG_FLAG_FNS(empty) +#undef CFQG_FLAG_FNS + +/* This should be called with the queue_lock held. */ +static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) +{ + unsigned long long now; + + if (!cfqg_stats_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + blkg_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + cfqg_stats_clear_waiting(stats); +} + +/* This should be called with the queue_lock held. 
*/ +static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, + struct cfq_group *curr_cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + if (cfqg_stats_waiting(stats)) + return; + if (cfqg == curr_cfqg) + return; + stats->start_group_wait_time = sched_clock(); + cfqg_stats_mark_waiting(stats); +} + +/* This should be called with the queue_lock held. */ +static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) +{ + unsigned long long now; + + if (!cfqg_stats_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + blkg_stat_add(&stats->empty_time, + now - stats->start_empty_time); + cfqg_stats_clear_empty(stats); +} + +static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) +{ + blkg_stat_add(&cfqg->stats.dequeue, 1); +} + +static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + if (blkg_rwstat_sum(&stats->queued)) + return; + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. 
+ */ + if (cfqg_stats_empty(stats)) + return; + + stats->start_empty_time = sched_clock(); + cfqg_stats_mark_empty(stats); +} + +static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + if (cfqg_stats_idling(stats)) { + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) + blkg_stat_add(&stats->idle_time, + now - stats->start_idle_time); + cfqg_stats_clear_idling(stats); + } +} + +static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + BUG_ON(cfqg_stats_idling(stats)); + + stats->start_idle_time = sched_clock(); + cfqg_stats_mark_idling(stats); +} + +static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) +{ + struct cfqg_stats *stats = &cfqg->stats; + + blkg_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_sum(&stats->queued)); + blkg_stat_add(&stats->avg_queue_size_samples, 1); + cfqg_stats_update_group_wait_time(stats); +} + +#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + +static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } +static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } +static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } +static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } +static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } +static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } +static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } + +#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ + #ifdef CONFIG_CFQ_GROUP_IOSCHED -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) 
\ + +static inline void cfqg_get(struct cfq_group *cfqg) +{ + return blkg_get(cfqg_to_blkg(cfqg)); +} + +static inline void cfqg_put(struct cfq_group *cfqg) +{ + return blkg_put(cfqg_to_blkg(cfqg)); +} + +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - blkg_path(&(cfqq)->cfqg->blkg), ##args) + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ - blkg_path(&(cfqg)->blkg), ##args) \ +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, + struct cfq_group *curr_cfqg, int rw) +{ + blkg_rwstat_add(&cfqg->stats.queued, rw, 1); + cfqg_stats_end_empty_time(&cfqg->stats); + cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); +} + +static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, + unsigned long time, unsigned long unaccounted_time) +{ + blkg_stat_add(&cfqg->stats.time, time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); +#endif +} + +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) +{ + blkg_rwstat_add(&cfqg->stats.queued, rw, -1); +} + +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) +{ + blkg_rwstat_add(&cfqg->stats.merged, rw, 1); +} + +static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, + uint64_t bytes, int rw) +{ + blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); + blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); + blkg_rwstat_add(&cfqg->stats.service_bytes, rw, 
bytes); +} + +static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, + uint64_t start_time, uint64_t io_start_time, int rw) +{ + struct cfqg_stats *stats = &cfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, rw, + io_start_time - start_time); +} + +static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +{ + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfqg_stats *stats = &cfqg->stats; + + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->service_bytes); + blkg_rwstat_reset(&stats->serviced); + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_reset(&stats->unaccounted_time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); +#endif +} + +#else /* CONFIG_CFQ_GROUP_IOSCHED */ + +static inline void cfqg_get(struct cfq_group *cfqg) { } +static inline void cfqg_put(struct cfq_group *cfqg) { } -#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) 
do {} while (0) -#endif + +static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, + struct cfq_group *curr_cfqg, int rw) { } +static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, + unsigned long time, unsigned long unaccounted_time) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } +static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, + uint64_t bytes, int rw) { } +static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, + uint64_t start_time, uint64_t io_start_time, int rw) { } + +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -466,8 +771,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, } static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, - struct io_context *, gfp_t); +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, + struct cfq_io_cq *cic, struct bio *bio, + gfp_t gfp_mask); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -545,7 +851,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) { u64 d = delta << CFQ_SERVICE_SHIFT; - d = d * BLKIO_WEIGHT_DEFAULT; + d = d * CFQ_WEIGHT_DEFAULT; do_div(d, cfqg->weight); return d; } @@ -872,9 +1178,9 @@ static void cfq_update_group_weight(struct cfq_group *cfqg) { BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - if (cfqg->needs_update) { + if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; - cfqg->needs_update = false; + cfqg->new_weight = 0; } } @@ -936,7 +1242,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfq_group_service_tree_del(st, cfqg); cfqg->saved_workload_slice = 0; - 
cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); + cfqg_stats_update_dequeue(cfqg); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, @@ -1008,178 +1314,59 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd), cfqq->nr_sectors); - cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, - unaccounted_sl); - cfq_blkiocg_set_start_empty_time(&cfqg->blkg); + cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); + cfqg_stats_set_start_empty_time(cfqg); } -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) -{ - if (blkg) - return container_of(blkg, struct cfq_group, blkg); - return NULL; -} - -static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, - unsigned int weight) -{ - struct cfq_group *cfqg = cfqg_of_blkg(blkg); - cfqg->new_weight = weight; - cfqg->needs_update = true; -} - -static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, - struct cfq_group *cfqg, struct blkio_cgroup *blkcg) -{ - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; - unsigned int major, minor; - - /* - * Add group onto cgroup list. It might happen that bdi->dev is - * not initialized yet. Initialize this new group without major - * and minor info and this info will be filled in once a new thread - * comes for IO. - */ - if (bdi->dev) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, - (void *)cfqd, MKDEV(major, minor)); - } else - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, - (void *)cfqd, 0); - - cfqd->nr_blkcg_linked_grps++; - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); - - /* Add group on cfqd list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); -} - -/* - * Should be called from sleepable context. 
No request queue lock as per - * cpu stats are allocated dynamically and alloc_percpu needs to be called - * from sleepable context. +/** + * cfq_init_cfqg_base - initialize base part of a cfq_group + * @cfqg: cfq_group to initialize + * + * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED + * is enabled or not. */ -static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) +static void cfq_init_cfqg_base(struct cfq_group *cfqg) { - struct cfq_group *cfqg = NULL; - int i, j, ret; struct cfq_rb_root *st; - - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); - if (!cfqg) - return NULL; + int i, j; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); cfqg->ttime.last_end_request = jiffies; - - /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * elevator which will be dropped by either elevator exit - * or cgroup deletion path depending on who is exiting first. - */ - cfqg->ref = 1; - - ret = blkio_alloc_blkg_stats(&cfqg->blkg); - if (ret) { - kfree(cfqg); - return NULL; - } - - return cfqg; } -static struct cfq_group * -cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void cfq_pd_init(struct blkcg_gq *blkg) { - struct cfq_group *cfqg = NULL; - void *key = cfqd; - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; - unsigned int major, minor; - - /* - * This is the common case when there are no blkio cgroups. 
- * Avoid lookup in this case - */ - if (blkcg == &blkio_root_cgroup) - cfqg = &cfqd->root_group; - else - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfqg->blkg.dev = MKDEV(major, minor); - } + struct cfq_group *cfqg = blkg_to_cfqg(blkg); - return cfqg; + cfq_init_cfqg_base(cfqg); + cfqg->weight = blkg->blkcg->cfq_weight; } /* * Search for the cfq group current task belongs to. request_queue lock must * be held. */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) +static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct blkio_cgroup *blkcg; - struct cfq_group *cfqg = NULL, *__cfqg = NULL; struct request_queue *q = cfqd->queue; + struct cfq_group *cfqg = NULL; - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - cfqg = cfq_find_cfqg(cfqd, blkcg); - if (cfqg) { - rcu_read_unlock(); - return cfqg; - } - - /* - * Need to allocate a group. Allocation of group also needs allocation - * of per cpu stats which in-turn takes a mutex() and can block. Hence - * we need to drop rcu lock and queue_lock before we call alloc. - * - * Not taking any queue reference here and assuming that queue is - * around by the time we return. CFQ queue allocation code does - * the same. It might be racy though. 
- */ - - rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); - - cfqg = cfq_alloc_cfqg(cfqd); - - spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkcg = task_blkio_cgroup(current); - - /* - * If some other thread already allocated the group while we were - * not holding queue lock, free up the group - */ - __cfqg = cfq_find_cfqg(cfqd, blkcg); + /* avoid lookup for the common case where there's no blkcg */ + if (blkcg == &blkcg_root) { + cfqg = cfqd->root_group; + } else { + struct blkcg_gq *blkg; - if (__cfqg) { - kfree(cfqg); - rcu_read_unlock(); - return __cfqg; + blkg = blkg_lookup_create(blkcg, q); + if (!IS_ERR(blkg)) + cfqg = blkg_to_cfqg(blkg); } - if (!cfqg) - cfqg = &cfqd->root_group; - - cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); - rcu_read_unlock(); - return cfqg; -} - -static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) -{ - cfqg->ref++; return cfqg; } @@ -1187,94 +1374,224 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ if (!cfq_cfqq_sync(cfqq)) - cfqg = &cfqq->cfqd->root_group; + cfqg = cfqq->cfqd->root_group; cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - cfqq->cfqg->ref++; + cfqg_get(cfqg); } -static void cfq_put_cfqg(struct cfq_group *cfqg) +static u64 cfqg_prfill_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - struct cfq_rb_root *st; - int i, j; + struct cfq_group *cfqg = pd_to_cfqg(pd); - BUG_ON(cfqg->ref <= 0); - cfqg->ref--; - if (cfqg->ref) - return; - for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb)); - free_percpu(cfqg->blkg.stats_cpu); - kfree(cfqg); + if (!cfqg->dev_weight) + return 0; + return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) { - /* Something wrong if we are trying to remove 
same group twice */ - BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), + cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, + false); + return 0; +} - hlist_del_init(&cfqg->cfqd_node); +static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + return 0; +} - BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); - cfqd->nr_blkcg_linked_grps--; +static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkg_conf_ctx ctx; + struct cfq_group *cfqg; + int ret; - /* - * Put the reference taken at the time of creation so that when all - * queues are gone, group can be destroyed. - */ - cfq_put_cfqg(cfqg); + ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + cfqg = blkg_to_cfqg(ctx.blkg); + if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + cfqg->dev_weight = ctx.v; + cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; + ret = 0; + } + + blkg_conf_finish(&ctx); + return ret; } -static void cfq_release_cfq_groups(struct cfq_data *cfqd) +static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) { - struct hlist_node *pos, *n; - struct cfq_group *cfqg; + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg_gq *blkg; + struct hlist_node *n; - hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { - /* - * If cgroup removal path got to blk_group first and removed - * it from cgroup list, then it will take care of destroying - * cfqg also. 
- */ - if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) - cfq_destroy_cfqg(cfqd, cfqg); + if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) + return -EINVAL; + + spin_lock_irq(&blkcg->lock); + blkcg->cfq_weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + + if (cfqg && !cfqg->dev_weight) + cfqg->new_weight = blkcg->cfq_weight; } + + spin_unlock_irq(&blkcg->lock); + return 0; } -/* - * Blk cgroup controller notification saying that blkio_group object is being - * delinked as associated cgroup object is going away. That also means that - * no new IO will come in this group. So get rid of this group as soon as - * any pending IO in the group is finished. - * - * This function is called under rcu_read_lock(). key is the rcu protected - * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu - * read lock. - * - * "key" was fetched from blkio_group under blkio_cgroup->lock. That means - * it should not be NULL as even if elevator was exiting, cgroup deltion - * path got to it first. 
- */ -static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) { - unsigned long flags; - struct cfq_data *cfqd = key; + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, + cft->private, false); + return 0; } -#else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) +static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) { - return &cfqd->root_group; + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, + cft->private, true); + return 0; } -static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +#ifdef CONFIG_DEBUG_BLK_CGROUP +static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - return cfqg; + struct cfq_group *cfqg = pd_to_cfqg(pd); + u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); + u64 v = 0; + + if (samples) { + v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); + do_div(v, samples); + } + __blkg_prfill_u64(sf, pd, v); + return 0; +} + +/* print avg_queue_size */ +static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, + &blkcg_policy_cfq, 0, false); + return 0; +} +#endif /* CONFIG_DEBUG_BLK_CGROUP */ + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight_device", + .read_seq_string = cfqg_print_weight_device, + .write_string = cfqg_set_weight_device, + .max_write_len = 256, + }, + { + .name = "weight", + .read_seq_string = cfq_print_weight, + .write_u64 
= cfq_set_weight, + }, + { + .name = "time", + .private = offsetof(struct cfq_group, stats.time), + .read_seq_string = cfqg_print_stat, + }, + { + .name = "sectors", + .private = offsetof(struct cfq_group, stats.sectors), + .read_seq_string = cfqg_print_stat, + }, + { + .name = "io_service_bytes", + .private = offsetof(struct cfq_group, stats.service_bytes), + .read_seq_string = cfqg_print_rwstat, + }, + { + .name = "io_serviced", + .private = offsetof(struct cfq_group, stats.serviced), + .read_seq_string = cfqg_print_rwstat, + }, + { + .name = "io_service_time", + .private = offsetof(struct cfq_group, stats.service_time), + .read_seq_string = cfqg_print_rwstat, + }, + { + .name = "io_wait_time", + .private = offsetof(struct cfq_group, stats.wait_time), + .read_seq_string = cfqg_print_rwstat, + }, + { + .name = "io_merged", + .private = offsetof(struct cfq_group, stats.merged), + .read_seq_string = cfqg_print_rwstat, + }, + { + .name = "io_queued", + .private = offsetof(struct cfq_group, stats.queued), + .read_seq_string = cfqg_print_rwstat, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "avg_queue_size", + .read_seq_string = cfqg_print_avg_queue_size, + }, + { + .name = "group_wait_time", + .private = offsetof(struct cfq_group, stats.group_wait_time), + .read_seq_string = cfqg_print_stat, + }, + { + .name = "idle_time", + .private = offsetof(struct cfq_group, stats.idle_time), + .read_seq_string = cfqg_print_stat, + }, + { + .name = "empty_time", + .private = offsetof(struct cfq_group, stats.empty_time), + .read_seq_string = cfqg_print_stat, + }, + { + .name = "dequeue", + .private = offsetof(struct cfq_group, stats.dequeue), + .read_seq_string = cfqg_print_stat, + }, + { + .name = "unaccounted_time", + .private = offsetof(struct cfq_group, stats.unaccounted_time), + .read_seq_string = cfqg_print_stat, + }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ + { } /* terminate */ +}; +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data 
*cfqd, + struct blkcg *blkcg) +{ + return cfqd->root_group; } static inline void @@ -1282,9 +1599,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; } -static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} -static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} - #endif /* GROUP_IOSCHED */ /* @@ -1551,12 +1865,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); - cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, - &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, + rq->cmd_flags); } static struct request * @@ -1612,8 +1924,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -1648,8 +1959,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, - bio_data_dir(bio), cfq_bio_sync(bio)); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); } static void @@ -1671,8 +1981,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, - rq_data_dir(next), rq_is_sync(next)); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ 
-1713,7 +2022,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { del_timer(&cfqd->idle_slice_timer); - cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); + cfqg_stats_update_idle_time(cfqq->cfqg); } static void __cfq_set_active_queue(struct cfq_data *cfqd, @@ -1722,7 +2031,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfqg_stats_update_avg_queue_size(cfqq->cfqg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -2043,7 +2352,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * task has exited, don't wait */ cic = cfqd->active_cic; - if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) + if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) return; /* @@ -2070,7 +2379,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); + cfqg_stats_set_start_idle_time(cfqq->cfqg); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, group_idle ? 
1 : 0); } @@ -2093,8 +2402,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -2677,7 +2985,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); - cfq_put_cfqg(cfqg); + cfqg_put(cfqg); } static void cfq_put_cooperator(struct cfq_queue *cfqq) @@ -2736,7 +3044,7 @@ static void cfq_exit_icq(struct io_cq *icq) } } -static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) +static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) { struct task_struct *tsk = current; int ioprio_class; @@ -2744,7 +3052,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) if (!cfq_cfqq_prio_changed(cfqq)) return; - ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); @@ -2756,11 +3064,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfqq->ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(ioc); + cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cfqq->ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(ioc); + cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: @@ -2778,19 +3086,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) cfq_clear_cfqq_prio_changed(cfqq); } -static void changed_ioprio(struct cfq_io_cq *cic) +static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) { + int ioprio = 
cic->icq.ioc->ioprio; struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; - if (unlikely(!cfqd)) + /* + * Check whether ioprio has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; cfqq = cic->cfqq[BLK_RW_ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, - GFP_ATOMIC); + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, + GFP_ATOMIC); if (new_cfqq) { cic->cfqq[BLK_RW_ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -2800,6 +3113,8 @@ static void changed_ioprio(struct cfq_io_cq *cic) cfqq = cic->cfqq[BLK_RW_SYNC]; if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); + + cic->ioprio = ioprio; } static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -2823,17 +3138,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void changed_cgroup(struct cfq_io_cq *cic) +static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { - struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); struct cfq_data *cfqd = cic_to_cfqd(cic); - struct request_queue *q; + struct cfq_queue *sync_cfqq; + uint64_t id; - if (unlikely(!cfqd)) - return; + rcu_read_lock(); + id = bio_blkcg(bio)->id; + rcu_read_unlock(); - q = cfqd->queue; + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) + return; + sync_cfqq = cic_to_cfqq(cic, 1); if (sync_cfqq) { /* * Drop reference to sync queue. 
A new sync queue will be @@ -2843,21 +3165,26 @@ static void changed_cgroup(struct cfq_io_cq *cic) cic_set_cfqq(cic, NULL, 1); cfq_put_queue(sync_cfqq); } + + cic->blkcg_id = id; } +#else +static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, - struct io_context *ioc, gfp_t gfp_mask) +cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, + struct bio *bio, gfp_t gfp_mask) { + struct blkcg *blkcg; struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_io_cq *cic; struct cfq_group *cfqg; retry: - cfqg = cfq_get_cfqg(cfqd); - cic = cfq_cic_lookup(cfqd, ioc); - /* cic always exists here */ + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); cfqq = cic_to_cfqq(cic, is_sync); /* @@ -2870,6 +3197,7 @@ retry: cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { + rcu_read_unlock(); spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask | __GFP_ZERO, @@ -2885,7 +3213,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, ioc); + cfq_init_prio_data(cfqq, cic); cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else @@ -2895,6 +3223,7 @@ retry: if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); + rcu_read_unlock(); return cfqq; } @@ -2904,6 +3233,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) switch (ioprio_class) { case IOPRIO_CLASS_RT: return &cfqd->async_cfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ case IOPRIO_CLASS_BE: return &cfqd->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: @@ -2914,11 +3246,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct 
io_context *ioc, - gfp_t gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, + struct bio *bio, gfp_t gfp_mask) { - const int ioprio = task_ioprio(ioc); - const int ioprio_class = task_ioprio_class(ioc); + const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); + const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; @@ -2928,7 +3260,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, } if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -3010,7 +3342,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) enable_idle = 0; - else if (!atomic_read(&cic->icq.ioc->nr_tasks) || + else if (!atomic_read(&cic->icq.ioc->active_ref) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; @@ -3174,8 +3506,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); } else { - cfq_blkiocg_update_idle_time_stats( - &cfqq->cfqg->blkg); + cfqg_stats_update_idle_time(cfqq->cfqg); cfq_mark_cfqq_must_dispatch(cfqq); } } @@ -3197,14 +3528,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_queue *cfqq = RQ_CFQQ(rq); cfq_log_cfqq(cfqd, cfqq, "insert_request"); - cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); + cfq_init_prio_data(cfqq, RQ_CIC(rq)); rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, - &cfqd->serving_group->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, + rq->cmd_flags); 
cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -3300,9 +3630,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqd->rq_in_driver--; cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; - cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, - rq_start_time_ns(rq), rq_io_start_time_ns(rq), - rq_data_dir(rq), rq_is_sync(rq)); + cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -3399,7 +3728,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); if (cfqq) { - cfq_init_prio_data(cfqq, cic->icq.ioc); + cfq_init_prio_data(cfqq, cic); return __cfq_may_queue(cfqq); } @@ -3421,7 +3750,7 @@ static void cfq_put_request(struct request *rq) cfqq->allocated[rw]--; /* Put down rq reference on cfqg */ - cfq_put_cfqg(RQ_CFQG(rq)); + cfqg_put(RQ_CFQG(rq)); rq->elv.priv[0] = NULL; rq->elv.priv[1] = NULL; @@ -3465,32 +3794,25 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) * Allocate cfq data structures associated with this request. 
*/ static int -cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, + gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - unsigned int changed; might_sleep_if(gfp_mask & __GFP_WAIT); spin_lock_irq(q->queue_lock); - /* handle changed notifications */ - changed = icq_get_changed(&cic->icq); - if (unlikely(changed & ICQ_IOPRIO_CHANGED)) - changed_ioprio(cic); -#ifdef CONFIG_CFQ_GROUP_IOSCHED - if (unlikely(changed & ICQ_CGROUP_CHANGED)) - changed_cgroup(cic); -#endif - + check_ioprio_changed(cic, bio); + check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -3516,8 +3838,9 @@ new_queue: cfqq->allocated[rw]++; cfqq->ref++; + cfqg_get(cfqq->cfqg); rq->elv.priv[0] = cfqq; - rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); + rq->elv.priv[1] = cfqq->cfqg; spin_unlock_irq(q->queue_lock); return 0; } @@ -3614,7 +3937,6 @@ static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; - bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -3624,89 +3946,52 @@ static void cfq_exit_queue(struct elevator_queue *e) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); cfq_put_async_queues(cfqd); - cfq_release_cfq_groups(cfqd); - - /* - * If there are groups which we could not unlink from blkcg list, - * wait for a rcu period for them to be freed. 
- */ - if (cfqd->nr_blkcg_linked_grps) - wait = true; spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); - /* - * Wait for cfqg->blkg->key accessors to exit their grace periods. - * Do this wait only if there are other unlinked groups out - * there. This can happen if cgroup deletion path claimed the - * responsibility of cleaning up a group before queue cleanup code - * get to the group. - * - * Do not call synchronize_rcu() unconditionally as there are drivers - * which create/delete request queue hundreds of times during scan/boot - * and synchronize_rcu() can take significant time and slow down boot. - */ - if (wait) - synchronize_rcu(); - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - /* Free up per cpu stats for root group */ - free_percpu(cfqd->root_group.blkg.stats_cpu); +#ifndef CONFIG_CFQ_GROUP_IOSCHED + kfree(cfqd->root_group); #endif + blkcg_deactivate_policy(q, &blkcg_policy_cfq); kfree(cfqd); } -static void *cfq_init_queue(struct request_queue *q) +static int cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - int i, j; - struct cfq_group *cfqg; - struct cfq_rb_root *st; + struct blkcg_gq *blkg __maybe_unused; + int i, ret; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) - return NULL; + return -ENOMEM; + + cfqd->queue = q; + q->elevator->elevator_data = cfqd; /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; - /* Init root group */ - cfqg = &cfqd->root_group; - for_each_cfqg_st(cfqg, i, j, st) - *st = CFQ_RB_ROOT; - RB_CLEAR_NODE(&cfqg->rb_node); - - /* Give preference to root group over other groups */ - cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; - + /* Init root group and prefer root group over other groups by default */ #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* - * Set root group reference to 2. One reference will be dropped when - * all groups on cfqd->cfqg_list are being deleted during queue exit. 
- * Other reference will remain there as we don't want to delete this - * group as it is statically allocated and gets destroyed when - * throtl_data goes away. - */ - cfqg->ref = 2; - - if (blkio_alloc_blkg_stats(&cfqg->blkg)) { - kfree(cfqg); - kfree(cfqd); - return NULL; - } - - rcu_read_lock(); + ret = blkcg_activate_policy(q, &blkcg_policy_cfq); + if (ret) + goto out_free; - cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, - (void *)cfqd, 0); - rcu_read_unlock(); - cfqd->nr_blkcg_linked_grps++; + cfqd->root_group = blkg_to_cfqg(q->root_blkg); +#else + ret = -ENOMEM; + cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), + GFP_KERNEL, cfqd->queue->node); + if (!cfqd->root_group) + goto out_free; - /* Add group on cfqd->cfqg_list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + cfq_init_cfqg_base(cfqd->root_group); #endif + cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; + /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -3718,13 +4003,17 @@ static void *cfq_init_queue(struct request_queue *q) /* * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. + * will not attempt to free it. oom_cfqq is linked to root_group + * but shouldn't hold a reference as it'll never be unlinked. Lose + * the reference from linking right away. 
*/ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); cfqd->oom_cfqq.ref++; - cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); - cfqd->queue = q; + spin_lock_irq(q->queue_lock); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); + cfqg_put(cfqd->root_group); + spin_unlock_irq(q->queue_lock); init_timer(&cfqd->idle_slice_timer); cfqd->idle_slice_timer.function = cfq_idle_slice_timer; @@ -3750,7 +4039,11 @@ static void *cfq_init_queue(struct request_queue *q) * second, in order to have larger depth for async operations. */ cfqd->last_delayed_sync = jiffies - HZ; - return cfqd; + return 0; + +out_free: + kfree(cfqd); + return ret; } /* @@ -3877,15 +4170,13 @@ static struct elevator_type iosched_cfq = { }; #ifdef CONFIG_CFQ_GROUP_IOSCHED -static struct blkio_policy_type blkio_policy_cfq = { - .ops = { - .blkio_unlink_group_fn = cfq_unlink_blkio_group, - .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, - }, - .plid = BLKIO_POLICY_PROP, +static struct blkcg_policy blkcg_policy_cfq = { + .pd_size = sizeof(struct cfq_group), + .cftypes = cfq_blkcg_files, + + .pd_init_fn = cfq_pd_init, + .pd_reset_stats_fn = cfq_pd_reset_stats, }; -#else -static struct blkio_policy_type blkio_policy_cfq; #endif static int __init cfq_init(void) @@ -3906,24 +4197,31 @@ static int __init cfq_init(void) #else cfq_group_idle = 0; #endif + + ret = blkcg_policy_register(&blkcg_policy_cfq); + if (ret) + return ret; + cfq_pool = KMEM_CACHE(cfq_queue, 0); if (!cfq_pool) - return -ENOMEM; + goto err_pol_unreg; ret = elv_register(&iosched_cfq); - if (ret) { - kmem_cache_destroy(cfq_pool); - return ret; - } - - blkio_policy_register(&blkio_policy_cfq); + if (ret) + goto err_free_pool; return 0; + +err_free_pool: + kmem_cache_destroy(cfq_pool); +err_pol_unreg: + blkcg_policy_unregister(&blkcg_policy_cfq); + return ret; } static void __exit cfq_exit(void) { - blkio_policy_unregister(&blkio_policy_cfq); + blkcg_policy_unregister(&blkcg_policy_cfq); elv_unregister(&iosched_cfq); 
kmem_cache_destroy(cfq_pool); } diff --git a/block/cfq.h b/block/cfq.h deleted file mode 100644 index 2a15592..0000000 --- a/block/cfq.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef _CFQ_H -#define _CFQ_H -#include "blk-cgroup.h" - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) -{ - blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); -} - -static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) -{ - blkiocg_update_dequeue_stats(blkg, dequeue); -} - -static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, unsigned long unaccounted_time) -{ - blkiocg_update_timeslice_used(blkg, time, unaccounted_time); -} - -static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) -{ - blkiocg_set_start_empty_time(blkg); -} - -static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) -{ - blkiocg_update_io_remove_stats(blkg, direction, sync); -} - -static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) -{ - blkiocg_update_io_merged_stats(blkg, direction, sync); -} - -static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ - blkiocg_update_idle_time_stats(blkg); -} - -static inline void -cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) -{ - blkiocg_update_avg_queue_size_stats(blkg); -} - -static inline void -cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) -{ - blkiocg_update_set_idle_time_stats(blkg); -} - -static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) -{ - blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); -} - -static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, 
uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) -{ - blkiocg_update_completion_stats(blkg, start_time, io_start_time, - direction, sync); -} - -static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); -} - -static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - return blkiocg_del_blkio_group(blkg); -} - -#else /* CFQ_GROUP_IOSCHED */ -static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, - struct blkio_group *curr_blkg, bool direction, bool sync) {} - -static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) {} - -static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time, unsigned long unaccounted_time) {} -static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} -static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, - bool direction, bool sync) {} -static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) -{ -} -static inline void -cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} - -static inline void -cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} - -static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, - uint64_t bytes, bool direction, bool sync) {} -static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} - -static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} -static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -{ - return 0; -} - 
-#endif /* CFQ_GROUP_IOSCHED */ -#endif diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 7bf12d7..599b12e 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). */ -static void *deadline_init_queue(struct request_queue *q) +static int deadline_init_queue(struct request_queue *q) { struct deadline_data *dd; dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); if (!dd) - return NULL; + return -ENOMEM; INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); @@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; - return dd; + + q->elevator->elevator_data = dd; + return 0; } /* diff --git a/block/elevator.c b/block/elevator.c index f016855..6a55d41 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -38,6 +38,7 @@ #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -121,15 +122,6 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static int elevator_init_queue(struct request_queue *q, - struct elevator_queue *eq) -{ - eq->elevator_data = eq->type->ops.elevator_init_fn(q); - if (eq->elevator_data) - return 0; - return -ENOMEM; -} - static char chosen_elevator[ELV_NAME_MAX]; static int __init elevator_setup(char *str) @@ -188,7 +180,6 @@ static void elevator_release(struct kobject *kobj) int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; - struct elevator_queue *eq; int err; if (unlikely(q->elevator)) @@ -222,17 +213,16 @@ int elevator_init(struct request_queue *q, char *name) } } - eq = elevator_alloc(q, e); - if (!eq) + q->elevator = elevator_alloc(q, e); + if (!q->elevator) return -ENOMEM; - err = 
elevator_init_queue(q, eq); + err = e->ops.elevator_init_fn(q); if (err) { - kobject_put(&eq->kobj); + kobject_put(&q->elevator->kobj); return err; } - q->elevator = eq; return 0; } EXPORT_SYMBOL(elevator_init); @@ -564,25 +554,6 @@ void elv_drain_elevator(struct request_queue *q) } } -void elv_quiesce_start(struct request_queue *q) -{ - if (!q->elevator) - return; - - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); - - blk_drain_queue(q, false); -} - -void elv_quiesce_end(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); -} - void __elv_add_request(struct request_queue *q, struct request *rq, int where) { trace_block_rq_insert(q, rq); @@ -692,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_set_req_fn) - return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); + return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); return 0; } @@ -801,8 +773,9 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) +int elv_register_queue(struct request_queue *q) { + struct elevator_queue *e = q->elevator; int error; error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); @@ -820,11 +793,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) } return error; } - -int elv_register_queue(struct request_queue *q) -{ - return __elv_register_queue(q, q->elevator); -} EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) @@ -907,53 +875,60 @@ EXPORT_SYMBOL_GPL(elv_unregister); 
*/ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old_elevator, *e; + struct elevator_queue *old = q->elevator; + bool registered = old->registered; int err; - /* allocate new elevator */ - e = elevator_alloc(q, new_e); - if (!e) - return -ENOMEM; + /* + * Turn on BYPASS and drain all requests w/ elevator private data. + * Block layer doesn't call into a quiesced elevator - all requests + * are directly put on the dispatch list without elevator data + * using INSERT_BACK. All requests have SOFTBARRIER set and no + * merge happens either. + */ + blk_queue_bypass_start(q); + + /* unregister and clear all auxiliary data of the old elevator */ + if (registered) + elv_unregister_queue(q); + + spin_lock_irq(q->queue_lock); + ioc_clear_queue(q); + spin_unlock_irq(q->queue_lock); - err = elevator_init_queue(q, e); + /* allocate, init and register new elevator */ + err = -ENOMEM; + q->elevator = elevator_alloc(q, new_e); + if (!q->elevator) + goto fail_init; + + err = new_e->ops.elevator_init_fn(q); if (err) { - kobject_put(&e->kobj); - return err; + kobject_put(&q->elevator->kobj); + goto fail_init; } - /* turn on BYPASS and drain all requests w/ elevator private data */ - elv_quiesce_start(q); - - /* unregister old queue, register new one and kill old elevator */ - if (q->elevator->registered) { - elv_unregister_queue(q); - err = __elv_register_queue(q, e); + if (registered) { + err = elv_register_queue(q); if (err) goto fail_register; } - /* done, clear io_cq's, switch elevators and turn off BYPASS */ - spin_lock_irq(q->queue_lock); - ioc_clear_queue(q); - old_elevator = q->elevator; - q->elevator = e; - spin_unlock_irq(q->queue_lock); - - elevator_exit(old_elevator); - elv_quiesce_end(q); + /* done, kill the old one and finish */ + elevator_exit(old); + blk_queue_bypass_end(q); - blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 
return 0; fail_register: - /* - * switch failed, exit the new io scheduler and reattach the old - * one again (along with re-adding the sysfs dir) - */ - elevator_exit(e); + elevator_exit(q->elevator); +fail_init: + /* switch failed, restore and re-register old elevator */ + q->elevator = old; elv_register_queue(q); - elv_quiesce_end(q); + blk_queue_bypass_end(q); return err; } diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 413a0b1..5d1bf70 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq) return list_entry(rq->queuelist.next, struct request, queuelist); } -static void *noop_init_queue(struct request_queue *q) +static int noop_init_queue(struct request_queue *q) { struct noop_data *nd; nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); if (!nd) - return NULL; + return -ENOMEM; + INIT_LIST_HEAD(&nd->queue); - return nd; + q->elevator->elevator_data = nd; + return 0; } static void noop_exit_queue(struct elevator_queue *e) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index cf0e63d..e54e31b 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -65,39 +65,80 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); +void *drbd_md_get_buffer(struct drbd_conf *mdev) +{ + int r; + + wait_event(mdev->misc_wait, + (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || + mdev->state.disk <= D_FAILED); + + return r ? 
NULL : page_address(mdev->md_io_page); +} + +void drbd_md_put_buffer(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->md_io_in_use)) + wake_up(&mdev->misc_wait); +} + +static bool md_io_allowed(struct drbd_conf *mdev) +{ + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} + +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt = bdev->dc.disk_timeout * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); + if (dt == 0) + dev_err(DEV, "meta-data IO operation timed out\n"); +} + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, int rw, int size) { struct bio *bio; - struct drbd_md_io md_io; int ok; - md_io.mdev = mdev; - init_completion(&md_io.event); - md_io.error = 0; + mdev->md_io.done = 0; + mdev->md_io.error = -ENODEV; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; ok = (bio_add_page(bio, page, size, 0) == size); if (!ok) goto out; - bio->bi_private = &md_io; + bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + ok = 0; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? 
DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&md_io.event); - ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); + ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: bio_put(bio); @@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int offset = 0; struct page *iop = mdev->md_io_page; - D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); BUG_ON(!bdev->md_bdev); @@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } - mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = (struct al_transaction *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ + if (!buffer) { + dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + return 1; + } buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); @@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); mdev->al_tr_number++; - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); complete(&((struct update_al_work *)w)->event); put_ldev(mdev); @@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) /* lock out all other meta data io for now, * and make sure the page is mapped. 
*/ - mutex_lock(&mdev->md_io_mutex); - buffer = page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + return 0; /* Find the valid transaction in the log */ for (i = 0; i <= mx; i++) { @@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (rv == 0) continue; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } cnr = be32_to_cpu(buffer->tr_number); @@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!found_valid) { dev_warn(DEV, "No usable activity log found.\n"); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 1; } @@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = drbd_al_read_tr(mdev, bdev, buffer, i); ERR_IF(rv == 0) goto cancel; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } @@ -534,7 +581,7 @@ cancel: mdev->al_tr_pos = 0; /* ok, we are done with it */ - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", transactions, active_extents); @@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d\n", + dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count); - dump_stack(); - - lc_put(mdev->resync, &ext->lce); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return; + ext->rs_failed, count, + drbd_conn_str(mdev->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. 
+ * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(mdev, enr); } } else { /* Normally this element should be in the cache, @@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev) put_ldev(mdev); } spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); return 0; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 3030201..b5c5ff5 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) static void bm_store_page_idx(struct page *page, unsigned long idx) { BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); - page_private(page) |= idx; + set_page_private(page, idx); } static unsigned long bm_page_to_idx(struct page *page) @@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev) struct bm_aio_ctx { struct drbd_conf *mdev; atomic_t in_flight; - struct completion done; + unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 int error; + struct kref kref; }; +static void bm_aio_ctx_destroy(struct kref *kref) +{ + struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + + put_ldev(ctx->mdev); + kfree(ctx); +} + /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { @@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error) bm_page_unlock_io(mdev, idx); - /* FIXME give back to page pool */ if (ctx->flags & BM_AIO_COPY_PAGES) - put_page(bio->bi_io_vec[0].bv_page); + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); bio_put(bio); - if (atomic_dec_and_test(&ctx->in_flight)) - complete(&ctx->done); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&mdev->misc_wait); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + } } static void bm_page_io_async(struct bm_aio_ctx 
*ctx, int page_nr, int rw) __must_hold(local) { - /* we are process context. we always get a bio */ - struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; @@ -966,10 +976,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - /* FIXME alloc_page is good enough for now, but actually needs - * to use pre-allocated page pool */ void *src, *dest; - page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); dest = kmap_atomic(page); src = kmap_atomic(b->bm_pages[page_nr]); memcpy(dest, src, PAGE_SIZE); @@ -981,6 +989,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; @@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ -static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx ctx = { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), - .flags = lazy_writeout_upper_idx ? 
BM_AIO_COPY_PAGES : 0, - }; + struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * For lazy writeout, we don't care for ongoing changes to the bitmap, * as we submit copies of pages anyways. */ - if (!ctx.flags) + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + + if (!ctx->flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); num_pages = b->bm_number_of_pages; @@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id continue; } } - atomic_inc(&ctx.in_flight); - bm_page_io_async(&ctx, i, rw); + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i, rw); ++count; cond_resched(); } /* - * We initialize ctx.in_flight to one to make sure bm_async_io_complete - * will not complete() early, and decrement / test it here. If there + * We initialize ctx->in_flight to one to make sure bm_async_io_complete + * will not set ctx->done early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. 
*/ - if (!atomic_dec_and_test(&ctx.in_flight)) - wait_for_completion(&ctx.done); + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); + else + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", count, jiffies - now); - if (ctx.error) { + if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); drbd_chk_io_error(mdev, 1, true); - err = -EIO; /* ctx.error ? */ + err = -EIO; /* ctx->error ? */ } + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk failed during IO... */ + now = jiffies; if (rw == WRITE) { drbd_md_flush(mdev); @@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } @@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ, 0); + return bm_rw(mdev, READ, 0, 0); } /** @@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE, 0); + return bm_rw(mdev, WRITE, 0, 0); } /** @@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) { - return bm_rw(mdev, WRITE, upper_idx); + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. 
It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); } @@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l */ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) { - struct bm_aio_ctx ctx = { + struct bm_aio_ctx *ctx; + int err; + + if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { + dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); + return 0; + } + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), + .done = 0, .flags = BM_AIO_COPY_PAGES, + .error = 0, + .kref = { ATOMIC_INIT(2) }, }; - if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { - dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); - return 0; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); + kfree(ctx); + return -ENODEV; } - bm_page_io_async(&ctx, idx, WRITE_SYNC); - wait_for_completion(&ctx.done); + bm_page_io_async(ctx, idx, WRITE_SYNC); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); - if (ctx.error) + if (ctx->error) drbd_chk_io_error(mdev, 1, true); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; - return ctx.error; + err = atomic_read(&ctx->in_flight) ? 
-EIO : ctx->error; + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + return err; } /* NOTE diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8d68056..02f013a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -712,7 +712,6 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ - int seq_num; unsigned long start_time; }; @@ -851,6 +850,7 @@ enum { NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -862,31 +862,30 @@ enum bm_flag { BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ /* currently locked for bulk operation */ - BM_LOCKED_MASK = 0x7, + BM_LOCKED_MASK = 0xf, /* in detail, that is: */ BM_DONT_CLEAR = 0x1, BM_DONT_SET = 0x2, BM_DONT_TEST = 0x4, + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + /* (test bit, count bit) allowed (common case) */ - BM_LOCKED_TEST_ALLOWED = 0x3, + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, /* testing bits, as well as setting new bits allowed, but clearing bits * would be unexpected. Used during bitmap receive. Setting new bits * requires sending of "out-of-sync" information, though. */ - BM_LOCKED_SET_ALLOWED = 0x1, + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, - /* clear is not expected while bitmap is locked for bulk operation */ + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. 
*/ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, }; - -/* TODO sort members for performance - * MAYBE group them further */ - -/* THINK maybe we actually want to use the default "event/%s" worker threads - * or similar in linux 2.6, which uses per cpu data and threads. - */ struct drbd_work_queue { struct list_head q; struct semaphore s; /* producers up it, worker down()s it */ @@ -938,8 +937,7 @@ struct drbd_backing_dev { }; struct drbd_md_io { - struct drbd_conf *mdev; - struct completion event; + unsigned int done; int error; }; @@ -1022,6 +1020,7 @@ struct drbd_conf { struct drbd_tl_epoch *newest_tle; struct drbd_tl_epoch *oldest_tle; struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; struct hlist_head *tl_hash; unsigned int tl_hash_s; @@ -1056,6 +1055,8 @@ struct drbd_conf { struct crypto_hash *csums_tfm; struct crypto_hash *verify_tfm; + unsigned long last_reattach_jif; + unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread asender; @@ -1094,7 +1095,8 @@ struct drbd_conf { wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct mutex md_io_mutex; /* protects the md_io_buffer */ + struct drbd_md_io md_io; + atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); -extern int _drbd_send_state(struct drbd_conf *mdev); -extern int drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); +extern int 
drbd_send_current_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); @@ -1461,6 +1463,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); @@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -extern struct page *drbd_pp_pool; /* drbd's page pool */ +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. 
+ */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); @@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_conf *mdev); +extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); @@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { struct page *page = e->pages; @@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) return 0; } - static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, @@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * Note: currently we don't support such large 
bitmaps on 32bit * arch anyways, but no harm done to be prepared for it here. */ - unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; + unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; unsigned long left = *bits_left >> shift; unsigned long total = 1UL + (mdev->rs_total >> shift); unsigned long tmp = 1000UL - left * 1000UL/total; @@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: + case D_FAILED: /* disk state is stable as well. */ break; /* no new io accepted during tansitional states */ case D_ATTACHING: - case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 211fc44..920ede2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -139,6 +139,8 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. 
@@ -159,7 +161,24 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) +static void bio_destructor_drbd(struct bio *bio) +{ + bio_free(bio, drbd_md_io_bio_set); +} + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + bio->bi_destructor = bio_destructor_drbd; + return bio; +} #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will @@ -208,6 +227,7 @@ static int tl_init(struct drbd_conf *mdev) mdev->oldest_tle = b; mdev->newest_tle = b; INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + INIT_LIST_HEAD(&mdev->barrier_acked_requests); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -246,9 +266,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) new->n_writes = 0; newest_before = mdev->newest_tle; - /* never send a barrier number == 0, because that is special-cased - * when using TCQ for our write ordering code */ - new->br_number = (newest_before->br_number+1) ?: 1; + new->br_number = newest_before->br_number+1; if (mdev->newest_tle != new) { mdev->newest_tle->next = new; mdev->newest_tle = new; @@ -311,7 +329,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, These have been list_move'd to the out_of_sequence_requests list in _req_mod(, barrier_acked) above. */ - list_del_init(&b->requests); + list_splice_init(&b->requests, &mdev->barrier_acked_requests); nob = b->next; if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { @@ -411,6 +429,23 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) b = tmp; list_splice(&carry_reads, &b->requests); } + + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. 
*/ + switch (what) { + case fail_frozen_disk_io: + case restart_frozen_disk_io: + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); + } + + case connection_lost_while_pending: + case resend: + break; + default: + dev_err(DEV, "what = %d in _tl_restart()\n", what); + } } @@ -458,6 +493,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } /** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ +void tl_abort_disk_io(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + b = b->next; + } + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + + spin_unlock_irq(&mdev->req_lock); +} + +/** * cl_wide_st_chg() - true if the state change is a cluster wide one * @mdev: DRBD device. * @os: old (current) state. 
@@ -470,7 +537,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev, ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); } @@ -509,8 +576,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort); + union drbd_state ns, enum sanitize_state_warnings *warn); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -785,6 +860,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) rv = SS_IN_TRANSIENT_STATE; + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... 
*/ + if (test_bit(STATE_SENT, &mdev->flags) && + !(os.conn == C_WF_REPORT_PARAMS || + (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -803,6 +885,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, return rv; } +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + /** * sanitize_state() - Resolves implicitly necessary additional changes to a state transition * @mdev: DRBD device. @@ -814,11 +911,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort) + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + if (warn) + *warn = NO_WARNING; + fp = FP_DONT_CARE; if (get_ldev(mdev)) { fp = mdev->ldev->dc.fencing; @@ -833,18 +933,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. * If you try to go into some Sync* state, that shall fail (elsewhere). 
*/ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) ns.conn = os.conn; /* we cannot fail (again) if we already detached */ if (ns.disk == D_FAILED && os.disk == D_DISKLESS) ns.disk = D_DISKLESS; - /* if we are only D_ATTACHING yet, - * we can (and should) go directly to D_DISKLESS. */ - if (ns.disk == D_FAILED && os.disk == D_ATTACHING) - ns.disk = D_DISKLESS; - /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -863,10 +958,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn_sync_abort) - *warn_sync_abort = - os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? - "Online-verify" : "Resync"; + if (warn) + *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? 
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; ns.conn = C_CONNECTED; } @@ -877,7 +971,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = mdev->new_state_tmp.disk; ns.pdsk = mdev->new_state_tmp.pdsk; } else { - dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; ns.disk = D_DISKLESS; ns.pdsk = D_UNKNOWN; } @@ -959,16 +1054,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = disk_max; if (ns.disk < disk_min) { - dev_warn(DEV, "Implicitly set disk from %s to %s\n", - drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; ns.disk = disk_min; } if (ns.pdsk > pdsk_max) ns.pdsk = pdsk_max; if (ns.pdsk < pdsk_min) { - dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", - drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; ns.pdsk = pdsk_min; } @@ -1045,12 +1140,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, { union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; - const char *warn_sync_abort = NULL; + enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; os = mdev->state; - ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + ns = sanitize_state(mdev, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1076,8 +1171,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } - if (warn_sync_abort) - dev_warn(DEV, "%s aborted.\n", warn_sync_abort); + print_sanitize_warnings(mdev, ssw); { char *pbp, pb[300]; @@ -1243,7 +1337,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_stop_nowait(&mdev->receiver); /* Upon network failure, we need to restart the receiver. 
*/ - if (os.conn > C_TEAR_DOWN && + if (os.conn > C_WF_CONNECTION && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); @@ -1251,6 +1345,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) drbd_resume_al(mdev); + /* remember last connect and attach times so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) + mdev->last_reconnect_jif = jiffies; + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + mdev->last_reattach_jif = jiffies; + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -1354,12 +1457,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ + if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) + mod_timer(&mdev->request_timer, jiffies + HZ); + nsm.i = -1; if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) what = resend; - if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) what = restart_frozen_disk_io; if (what != nothing) @@ -1408,7 +1515,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... 
*/ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1441,11 +1548,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(mdev); drbd_send_uuids(mdev); } - /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) /* We may still be Primary ourselves. @@ -1473,14 +1580,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1490,10 +1597,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* We are in the progress to start a full sync... 
*/ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1513,33 +1620,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; - int was_io_error; + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS, - * so it is safe to dreference ldev here. */ - eh = mdev->ldev->dc.on_io_error; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I am detaching my disk\n"); - else - dev_err(DEV, "Sending state for detaching disk failed\n"); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); + * our cleanup here with the transition to D_DISKLESS. + * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (mdev->ldev) { + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* Immediately allow completion of all application IO, that waits + for completion from the local disk. */ + tl_abort_disk_io(mdev); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. 
*/ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + } put_ldev(mdev); if (was_io_error && eh == EP_CALL_HELPER) @@ -1561,16 +1673,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I'm now diskless.\n"); + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); } /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) - drbd_send_state(mdev); + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1588,7 +1701,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. 
*/ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &mdev->flags); + wake_up(&mdev->state_wait); + } /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -1598,8 +1717,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, * No harm done if some bits change during this phase. */ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, - "write from resync_finished", BM_LOCKED_SET_ALLOWED); + drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(mdev); } @@ -2057,7 +2176,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; + uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); @@ -2089,6 +2212,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } + /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ + if (mdev->agreed_pro_version <= 94) + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 
0 : drbd_get_capacity(mdev->this_bdev)); @@ -2102,10 +2229,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl } /** - * drbd_send_state() - Sends the drbd state to the peer + * drbd_send_current_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_state(struct drbd_conf *mdev) +int drbd_send_current_state(struct drbd_conf *mdev) { struct socket *sock; struct p_state p; @@ -2131,6 +2258,37 @@ int drbd_send_state(struct drbd_conf *mdev) return ok; } +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(state.i); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) { @@ -2615,7 +2773,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_no_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 
0 : MSG_MORE)) @@ -2629,7 +2787,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2695,8 +2853,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(req->seq_num = - atomic_add_return(1, &mdev->packet_seq)); + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); @@ -2987,8 +3144,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); + atomic_set(&mdev->md_io_in_use, 0); - mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); mutex_init(&mdev->meta.mutex); sema_init(&mdev->data.work.s, 0); @@ -3126,6 +3283,10 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -3139,6 +3300,8 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -3162,6 +3325,8 @@ static int drbd_create_mempools(void) drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3185,6 +3350,16 @@ static int drbd_create_mempools(void) goto Enomem; /* 
mempools */ +#ifdef COMPAT_HAVE_BIOSET_CREATE + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; +#endif + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) @@ -3262,6 +3437,8 @@ static void drbd_delete_device(unsigned int minor) if (!mdev) return; + del_timer_sync(&mdev->request_timer); + /* paranoia asserts */ if (mdev->open_cnt != 0) dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, @@ -3666,8 +3843,10 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; + memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -3698,7 +3877,8 @@ void drbd_md_sync(struct drbd_conf *mdev) * since we updated it on metadata. 
*/ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); +out: put_ldev(mdev); } @@ -3718,8 +3898,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3780,7 +3961,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = 127; err: - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); + out: put_ldev(mdev); return rv; @@ -4183,12 +4365,11 @@ const char *drbd_buildtag(void) static char buildtag[38] = "\0uilt-in"; if (buildtag[0] == 0) { -#ifdef CONFIG_MODULES - if (THIS_MODULE != NULL) - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); - else +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; #endif - buildtag[0] = 'b'; } return buildtag; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 946166e..6d4de6a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data) */ spin_lock_irq(&mdev->req_lock); ns = mdev->state; - if (ns.conn < C_WF_REPORT_PARAMS) { + if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { ns.pdsk = nps; _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); } @@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } drbd_md_sync(mdev); @@ -845,9 +845,10 @@ void 
drbd_reconsider_max_bio_size(struct drbd_conf *mdev) Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) - peer = mdev->peer_max_bio_size; - else if (mdev->agreed_pro_version == 94) + if (mdev->agreed_pro_version < 94) { + peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; @@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TO_SMALL; + retcode = ERR_MD_DISK_TOO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); @@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * (we may currently be R_PRIMARY with no local disk...) 
*/ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } @@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { enum drbd_ret_code retcode; int ret; + struct detach dt = {}; + + if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { + reply->ret_code = ERR_MANDATORY_TAG; + goto out; + } + + if (dt.detach_force) { + drbd_force_state(mdev, NS(disk, D_FAILED)); + reply->ret_code = SS_SUCCESS; + goto out; + } + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + drbd_md_put_buffer(mdev); /* D_FAILED will transition to DISKLESS. 
*/ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; reply->ret_code = retcode; +out: return 0; } @@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, if (rs.no_resync && mdev->agreed_pro_version < 93) { retcode = ERR_NEED_APV_93; - goto fail; + goto fail_ldev; } if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) @@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, fail: reply->ret_code = retcode; return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; } static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. 
*/ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); @@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* w_make_ov_request expects position to be aligned */ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + drbd_resume_io(mdev); return 0; } diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2959cdf..869bada 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) if (unlikely(v >= 1000000)) { /* cool: > GiByte/s */ seq_printf(seq, "%ld,", v / 1000000); - v /= 1000000; + v %= 1000000; seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); } else if (likely(v >= 1000)) seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 436f519..ea4836e 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what, goto out; } (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); out: return err; @@ -750,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev) { struct socket *s, *sock, *msock; int try, h, ok; + enum 
drbd_state_rv rv; D_ASSERT(!mdev->data.socket); @@ -888,25 +890,32 @@ retry: } } - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) - return 0; - sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set(&mdev->packet_seq, 0); mdev->peer_seq = 0; - drbd_thread_start(&mdev->asender); - if (drbd_send_protocol(mdev) == -1) return -1; + set_bit(STATE_SENT, &mdev->flags); drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + + spin_lock_irq(&mdev->req_lock); + rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); + if (mdev->state.conn != C_WF_REPORT_PARAMS) + clear_bit(STATE_SENT, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) + return 0; + + drbd_thread_start(&mdev->asender); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return 1; @@ -957,7 +966,7 @@ static void drbd_flush(struct drbd_conf *mdev) rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, NULL); if (rv) { - dev_err(DEV, "local disk flush failed with status %d\n", rv); + dev_info(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. 
* don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ @@ -1001,13 +1010,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); spin_lock(&mdev->epoch_lock); } - dec_unacked(mdev); + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(mdev); if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); @@ -1096,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the - * request in more than one bio. */ + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { @@ -1583,6 +1597,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u return ok; } +static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) +{ + + struct drbd_epoch_entry *rs_e; + bool rv = 0; + + spin_lock_irq(&mdev->req_lock); + list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { + if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&mdev->req_lock); + + return rv; +} + /* Called from receive_Data. * Synchronize packets on sock with packets on msock. 
* @@ -1826,6 +1858,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned list_add(&e->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->req_lock); + if (mdev->state.conn == C_SYNC_TARGET) + wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); + switch (mdev->net_conf->wire_protocol) { case DRBD_PROT_C: inc_unacked(mdev); @@ -2420,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; - dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); + dev_info(DEV, "Lost last syncUUID packet, corrected:\n"); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); return -1; @@ -2806,10 +2841,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi if (apv >= 88) { if (apv == 88) { - if (data_size > SHARED_SECRET_MAX) { - dev_err(DEV, "verify-alg too long, " - "peer wants %u, accepting only %u byte\n", - data_size, SHARED_SECRET_MAX); + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + dev_err(DEV, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); return false; } @@ -3168,9 +3203,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - /* peer says his disk is uptodate, while we think it is inconsistent, - * and this happens while we think we have a sync going on. */ - if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. 
*/ + if (os.conn <= C_TEAR_DOWN) + return false; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { /* If we are (becoming) SyncSource, but peer is still in sync * preparation, ignore its uptodate-ness to avoid flapping, it @@ -3288,7 +3334,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } } @@ -3776,6 +3822,13 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (mdev->state.conn == C_STANDALONE) return; + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + /* asender does not clean up anything. 
it must not interfere, either */ drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); @@ -3803,8 +3856,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); - del_timer(&mdev->request_timer); - /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); @@ -4433,7 +4484,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) if (mdev->state.conn == C_AHEAD && atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { mdev->start_resync_timer.expires = jiffies + HZ; add_timer(&mdev->start_resync_timer); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4a0f314..9c5c849 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req const int rw = bio_data_dir(bio); int cpu; cpu = part_stat_lock(); + part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; struct drbd_conf *mdev = req->mdev; - /* only WRITES may end up here without a master bio (on barrier ack) */ - int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + int rw = req->rq_state & RQ_WRITE ? 
WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING) + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) return; if (req->master_bio) { @@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) req->master_bio = NULL; } + if (s & RQ_LOCAL_PENDING) + return; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, * or protocol C P_WRITE_ACK, @@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case completed_ok: - if (bio_data_dir(req->master_bio) == WRITE) + if (req->rq_state & RQ_WRITE) mdev->writ_cnt += req->size>>9; else mdev->read_cnt += req->size>>9; @@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); + break; + + case abort_disk_io: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; break; case write_completed_with_error: @@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(mdev, false); _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_ahead_completed_with_error: @@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_completed_with_error: @@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(!(req->rq_state & RQ_NET_MASK)); __drbd_chk_io_error(mdev, false); - put_ldev(mdev); + + goto_queue_for_net_read: /* 
no point in retrying if there is no good remote data, * or we have no connection. */ @@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; - case oos_handed_to_network: - /* actually the same */ + case read_retry_remote_canceled: case send_canceled: - /* treat it the same */ case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ @@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; - /* because _drbd_send_zc_bio could sleep, and may want to - * dereference the bio even after the "write_acked_by_peer" and - * "completed_ok" events came in, once we return from - * _drbd_send_zc_bio (drbd_send_dblock), we have to check - * whether it is done already, and end it. */ _req_may_be_done_not_susp(req, m); break; - case read_retry_remote_canceled: + case oos_handed_to_network: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ req->rq_state &= ~RQ_NET_QUEUED; - /* fall through, in case we raced with drbd_disconnect */ + req->rq_state |= RQ_NET_DONE; + _req_may_be_done_not_susp(req, m); + break; + case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? 
*/ @@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case write_acked_by_peer_and_sis: - req->rq_state |= RQ_NET_SIS; case conflict_discarded_by_peer: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential @@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ + case write_acked_by_peer_and_sis: case write_acked_by_peer: + if (what == write_acked_by_peer_and_sis) + req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. - * Nothing to do here. + * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater - * for volatile write-back caches on lower level devices. - * - * A barrier request is expected to have forced all prior - * requests onto stable storage, so completion of a barrier - * request could set NET_DONE right here, and not wait for the - * P_BARRIER_ACK, but that is an unnecessary optimization. */ + * for volatile write-back caches on lower level devices. */ - /* this makes it effectively the same as for: */ case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. 
* see also notes above in handed_over_to_network about @@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns int local, remote, send_oos = 0; int err = -EIO; int ret = 0; + union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns drbd_al_begin_io(mdev, sector); } - remote = remote && drbd_should_do_remote(mdev->state); - send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); + s = mdev->state; + remote = remote && drbd_should_do_remote(s); + send_oos = rw == WRITE && drbd_should_send_oos(s); D_ASSERT(!(remote && send_oos)); if (!(local || remote) && !is_susp(mdev->state)) { @@ -867,7 +871,7 @@ allocate_barrier: if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of - generic_make_request() to restart processing of this + drbd_make_request() to restart processing of this bio. In the next call to drbd_make_request we sleep in inc_ap_bio() */ ret = 1; @@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) */ D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); - D_ASSERT(bio->bi_idx == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ @@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; if (likely(s_enr == e_enr)) { - inc_ap_bio(mdev, 1); - drbd_make_request_common(mdev, bio, start_time); + do { + inc_ap_bio(mdev, 1); + } while (drbd_make_request_common(mdev, bio, start_time)); return; } @@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; struct drbd_request *req; /* oldest request */ struct list_head *le; - unsigned long et = 0; /* effective timeout = ko_count * timeout */ + unsigned long ent = 0, dt = 
0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; if (get_net_conf(mdev)) { - et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; + if (mdev->state.conn >= C_WF_REPORT_PARAMS) + ent = mdev->net_conf->timeout*HZ/10 + * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) + if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = mdev->ldev->dc.disk_timeout * HZ / 10; + put_ldev(mdev); + } + et = min_not_zero(dt, ent); + + if (!et) return; /* Recurring timer stopped */ + now = jiffies; + spin_lock_irq(&mdev->req_lock); le = &mdev->oldest_tle->requests; if (list_empty(le)) { spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, jiffies + et); + mod_timer(&mdev->request_timer, now + et); return; } le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - if (time_is_before_eq_jiffies(req->start_time + et)) { - if (req->rq_state & RQ_NET_PENDING) { - dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL); - } else { - dev_warn(DEV, "Local backing block device frozen?\n"); - mod_timer(&mdev->request_timer, jiffies + et); - } - } else { - mod_timer(&mdev->request_timer, req->start_time + et); - } + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). 
+ * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + if (ent && req->rq_state & RQ_NET_PENDING && + time_after(now, req->start_time + ent) && + !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + } + if (dt && req->rq_state & RQ_LOCAL_PENDING && + time_after(now, req->start_time + dt) && + !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, 1); + } + nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); + mod_timer(&mdev->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 68a234a..3d21119 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -105,6 +105,7 @@ enum drbd_req_event { read_completed_with_error, read_ahead_completed_with_error, write_completed_with_error, + abort_disk_io, completed_ok, resend, fail_frozen_disk_io, @@ -118,18 +119,21 @@ enum drbd_req_event { * same time, so we should hold the request lock anyways. 
*/ enum drbd_req_state_bits { - /* 210 - * 000: no local possible - * 001: to be submitted + /* 3210 + * 0000: no local possible + * 0001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 110: completed ok - * 010: completed with error + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, - /* 76543 + /* 87654 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -199,8 +203,9 @@ enum drbd_req_state_bits { #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d3e6f6..620c70f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -70,11 +70,29 @@ rwlock_t global_state_lock; void drbd_md_io_complete(struct bio *bio, int error) { struct drbd_md_io *md_io; + struct drbd_conf *mdev; md_io = (struct drbd_md_io *)bio->bi_private; + mdev = container_of(md_io, struct drbd_conf, md_io); + md_io->error = error; - complete(&md_io->event); + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. 
+ * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(mdev); + md_io->done = 1; + wake_up(&mdev->misc_wait); + bio_put(bio); + put_ldev(mdev); } /* reads on behalf of the partner, @@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error) spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); spin_unlock_irqrestore(&mdev->req_lock, flags); + put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); @@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); crypto_hash_update(&desc, &sg, sg.length); } @@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); return 1; } @@ -1519,14 +1538,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } drbd_state_lock(mdev); - + write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + write_unlock_irq(&global_state_lock); drbd_state_unlock(mdev); return; } - write_lock_irq(&global_state_lock); - ns = mdev->state; + ns.i = mdev->state.i; ns.aftr_isp = !_drbd_may_sync_now(mdev); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index b0b00d7..cce7df3 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -551,7 +551,7 @@ static void floppy_ready(void); static void floppy_start(void); static void process_fd_request(void); static void recalibrate_floppy(void); -static void floppy_shutdown(unsigned 
long); +static void floppy_shutdown(struct work_struct *); static int floppy_request_regions(int); static void floppy_release_regions(int); @@ -588,6 +588,8 @@ static int buffer_max = -1; static struct floppy_fdc_state fdc_state[N_FDC]; static int fdc; /* current fdc */ +static struct workqueue_struct *floppy_wq; + static struct floppy_struct *_floppy = floppy_type; static unsigned char current_drive; static long current_count_sectors; @@ -629,16 +631,15 @@ static inline void set_debugt(void) { } static inline void debugt(const char *func, const char *msg) { } #endif /* DEBUGT */ -typedef void (*timeout_fn)(unsigned long); -static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); +static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown); static const char *timeout_message; static void is_alive(const char *func, const char *message) { /* this routine checks whether the floppy driver is "alive" */ if (test_bit(0, &fdc_busy) && command_status < 2 && - !timer_pending(&fd_timeout)) { + !delayed_work_pending(&fd_timeout)) { DPRINT("%s: timeout handler died. 
%s\n", func, message); } } @@ -666,15 +667,18 @@ static int output_log_pos; static void __reschedule_timeout(int drive, const char *message) { + unsigned long delay; + if (drive == current_reqD) drive = current_drive; - del_timer(&fd_timeout); + if (drive < 0 || drive >= N_DRIVE) { - fd_timeout.expires = jiffies + 20UL * HZ; + delay = 20UL * HZ; drive = 0; } else - fd_timeout.expires = jiffies + UDP->timeout; - add_timer(&fd_timeout); + delay = UDP->timeout; + + queue_delayed_work(floppy_wq, &fd_timeout, delay); if (UDP->flags & FD_DEBUG) DPRINT("reschedule timeout %s\n", message); timeout_message = message; @@ -872,7 +876,7 @@ static int lock_fdc(int drive, bool interruptible) command_status = FD_COMMAND_NONE; - __reschedule_timeout(drive, "lock fdc"); + reschedule_timeout(drive, "lock fdc"); set_fdc(drive); return 0; } @@ -880,23 +884,15 @@ static int lock_fdc(int drive, bool interruptible) /* unlocks the driver */ static void unlock_fdc(void) { - unsigned long flags; - - raw_cmd = NULL; if (!test_bit(0, &fdc_busy)) DPRINT("FDC access conflict!\n"); - if (do_floppy) - DPRINT("device interrupt still active at FDC release: %pf!\n", - do_floppy); + raw_cmd = NULL; command_status = FD_COMMAND_NONE; - spin_lock_irqsave(&floppy_lock, flags); - del_timer(&fd_timeout); + __cancel_delayed_work(&fd_timeout); + do_floppy = NULL; cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || set_next_request()) - do_fd_request(current_req->q); - spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); } @@ -968,26 +964,24 @@ static DECLARE_WORK(floppy_work, NULL); static void schedule_bh(void (*handler)(void)) { + WARN_ON(work_pending(&floppy_work)); + PREPARE_WORK(&floppy_work, (work_func_t)handler); - schedule_work(&floppy_work); + queue_work(floppy_wq, &floppy_work); } -static DEFINE_TIMER(fd_timer, NULL, 0, 0); +static DECLARE_DELAYED_WORK(fd_timer, NULL); static void cancel_activity(void) { - unsigned long flags; - - spin_lock_irqsave(&floppy_lock, flags); do_floppy 
= NULL; - PREPARE_WORK(&floppy_work, (work_func_t)empty); - del_timer(&fd_timer); - spin_unlock_irqrestore(&floppy_lock, flags); + cancel_delayed_work_sync(&fd_timer); + cancel_work_sync(&floppy_work); } /* this function makes sure that the disk stays in the drive during the * transfer */ -static void fd_watchdog(void) +static void fd_watchdog(struct work_struct *arg) { debug_dcl(DP->flags, "calling disk change from watchdog\n"); @@ -997,21 +991,20 @@ static void fd_watchdog(void) cont->done(0); reset_fdc(); } else { - del_timer(&fd_timer); - fd_timer.function = (timeout_fn)fd_watchdog; - fd_timer.expires = jiffies + HZ / 10; - add_timer(&fd_timer); + cancel_delayed_work(&fd_timer); + PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog); + queue_delayed_work(floppy_wq, &fd_timer, HZ / 10); } } static void main_command_interrupt(void) { - del_timer(&fd_timer); + cancel_delayed_work(&fd_timer); cont->interrupt(); } /* waits for a delay (spinup or select) to pass */ -static int fd_wait_for_completion(unsigned long delay, timeout_fn function) +static int fd_wait_for_completion(unsigned long expires, work_func_t function) { if (FDCS->reset) { reset_fdc(); /* do the reset during sleep to win time @@ -1020,11 +1013,10 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function) return 1; } - if (time_before(jiffies, delay)) { - del_timer(&fd_timer); - fd_timer.function = function; - fd_timer.expires = delay; - add_timer(&fd_timer); + if (time_before(jiffies, expires)) { + cancel_delayed_work(&fd_timer); + PREPARE_DELAYED_WORK(&fd_timer, function); + queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies); return 1; } return 0; @@ -1342,7 +1334,7 @@ static int fdc_dtr(void) */ FDCS->dtr = raw_cmd->rate & 3; return fd_wait_for_completion(jiffies + 2UL * HZ / 100, - (timeout_fn)floppy_ready); + (work_func_t)floppy_ready); } /* fdc_dtr */ static void tell_sector(void) @@ -1447,7 +1439,7 @@ static void setup_rw_floppy(void) int flags; int dflags; unsigned long 
ready_date; - timeout_fn function; + work_func_t function; flags = raw_cmd->flags; if (flags & (FD_RAW_READ | FD_RAW_WRITE)) @@ -1461,9 +1453,9 @@ static void setup_rw_floppy(void) */ if (time_after(ready_date, jiffies + DP->select_delay)) { ready_date -= DP->select_delay; - function = (timeout_fn)floppy_start; + function = (work_func_t)floppy_start; } else - function = (timeout_fn)setup_rw_floppy; + function = (work_func_t)setup_rw_floppy; /* wait until the floppy is spinning fast enough */ if (fd_wait_for_completion(ready_date, function)) @@ -1493,7 +1485,7 @@ static void setup_rw_floppy(void) inr = result(); cont->interrupt(); } else if (flags & FD_RAW_NEED_DISK) - fd_watchdog(); + fd_watchdog(NULL); } static int blind_seek; @@ -1802,20 +1794,22 @@ static void show_floppy(void) pr_info("do_floppy=%pf\n", do_floppy); if (work_pending(&floppy_work)) pr_info("floppy_work.func=%pf\n", floppy_work.func); - if (timer_pending(&fd_timer)) - pr_info("fd_timer.function=%pf\n", fd_timer.function); - if (timer_pending(&fd_timeout)) { - pr_info("timer_function=%pf\n", fd_timeout.function); - pr_info("expires=%lu\n", fd_timeout.expires - jiffies); - pr_info("now=%lu\n", jiffies); - } + if (delayed_work_pending(&fd_timer)) + pr_info("delayed work.function=%p expires=%ld\n", + fd_timer.work.func, + fd_timer.timer.expires - jiffies); + if (delayed_work_pending(&fd_timeout)) + pr_info("timer_function=%p expires=%ld\n", + fd_timeout.work.func, + fd_timeout.timer.expires - jiffies); + pr_info("cont=%p\n", cont); pr_info("current_req=%p\n", current_req); pr_info("command_status=%d\n", command_status); pr_info("\n"); } -static void floppy_shutdown(unsigned long data) +static void floppy_shutdown(struct work_struct *arg) { unsigned long flags; @@ -1868,7 +1862,7 @@ static int start_motor(void (*function)(void)) /* wait_for_completion also schedules reset if needed. 
*/ return fd_wait_for_completion(DRS->select_date + DP->select_delay, - (timeout_fn)function); + (work_func_t)function); } static void floppy_ready(void) @@ -2821,7 +2815,6 @@ do_request: spin_lock_irq(&floppy_lock); pending = set_next_request(); spin_unlock_irq(&floppy_lock); - if (!pending) { do_floppy = NULL; unlock_fdc(); @@ -2898,13 +2891,15 @@ static void do_fd_request(struct request_queue *q) current_req->cmd_flags)) return; - if (test_bit(0, &fdc_busy)) { + if (test_and_set_bit(0, &fdc_busy)) { /* fdc busy, this new request will be treated when the current one is done */ is_alive(__func__, "old request running"); return; } - lock_fdc(MAXTIMEOUT, false); + command_status = FD_COMMAND_NONE; + __reschedule_timeout(MAXTIMEOUT, "fd_request"); + set_fdc(0); process_fd_request(); is_alive(__func__, ""); } @@ -3612,9 +3607,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) mutex_lock(&floppy_mutex); mutex_lock(&open_lock); - if (UDRS->fd_ref < 0) - UDRS->fd_ref = 0; - else if (!UDRS->fd_ref--) { + if (!UDRS->fd_ref--) { DPRINT("floppy_release with fd_ref == 0"); UDRS->fd_ref = 0; } @@ -3650,13 +3643,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) set_bit(FD_VERIFY_BIT, &UDRS->flags); } - if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) - goto out2; - - if (mode & FMODE_EXCL) - UDRS->fd_ref = -1; - else - UDRS->fd_ref++; + UDRS->fd_ref++; opened_bdev[drive] = bdev; @@ -3719,10 +3706,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&floppy_mutex); return 0; out: - if (UDRS->fd_ref < 0) - UDRS->fd_ref = 0; - else - UDRS->fd_ref--; + UDRS->fd_ref--; + if (!UDRS->fd_ref) opened_bdev[drive] = NULL; out2: @@ -4159,10 +4144,16 @@ static int __init floppy_init(void) goto out_put_disk; } + floppy_wq = alloc_ordered_workqueue("floppy", 0); + if (!floppy_wq) { + err = -ENOMEM; + goto out_put_disk; + } + disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); if (!disks[dr]->queue) { 
err = -ENOMEM; - goto out_put_disk; + goto out_destroy_workq; } blk_queue_max_hw_sectors(disks[dr]->queue, 64); @@ -4213,7 +4204,7 @@ static int __init floppy_init(void) use_virtual_dma = can_use_virtual_dma & 1; fdc_state[0].address = FDC1; if (fdc_state[0].address == -1) { - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); err = -ENODEV; goto out_unreg_region; } @@ -4224,7 +4215,7 @@ static int __init floppy_init(void) fdc = 0; /* reset fdc in case of unexpected interrupt */ err = floppy_grab_irq_and_dma(); if (err) { - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); err = -EBUSY; goto out_unreg_region; } @@ -4281,13 +4272,13 @@ static int __init floppy_init(void) user_reset_fdc(-1, FD_RESET_ALWAYS, false); } fdc = 0; - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); current_drive = 0; initialized = true; if (have_no_fdc) { DPRINT("no floppy controllers found\n"); err = have_no_fdc; - goto out_flush_work; + goto out_release_dma; } for (drive = 0; drive < N_DRIVE; drive++) { @@ -4302,7 +4293,7 @@ static int __init floppy_init(void) err = platform_device_register(&floppy_device[drive]); if (err) - goto out_flush_work; + goto out_release_dma; err = device_create_file(&floppy_device[drive].dev, &dev_attr_cmos); @@ -4320,13 +4311,14 @@ static int __init floppy_init(void) out_unreg_platform_dev: platform_device_unregister(&floppy_device[drive]); -out_flush_work: - flush_work_sync(&floppy_work); +out_release_dma: if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); platform_driver_unregister(&floppy_driver); +out_destroy_workq: + destroy_workqueue(floppy_wq); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4397,7 +4389,7 @@ static int floppy_grab_irq_and_dma(void) * We might have scheduled a free_irq(), wait it to * drain first: */ - flush_work_sync(&floppy_work); + flush_workqueue(floppy_wq); if (fd_request_irq()) { 
DPRINT("Unable to grab IRQ%d for the floppy driver\n", @@ -4488,9 +4480,9 @@ static void floppy_release_irq_and_dma(void) pr_info("motor off timer %d still active\n", drive); #endif - if (timer_pending(&fd_timeout)) + if (delayed_work_pending(&fd_timeout)) pr_info("floppy timer still active:%s\n", timeout_message); - if (timer_pending(&fd_timer)) + if (delayed_work_pending(&fd_timer)) pr_info("auxiliary floppy timer still active\n"); if (work_pending(&floppy_work)) pr_info("work still pending\n"); @@ -4560,8 +4552,9 @@ static void __exit floppy_module_exit(void) put_disk(disks[drive]); } - del_timer_sync(&fd_timeout); - del_timer_sync(&fd_timer); + cancel_delayed_work_sync(&fd_timeout); + cancel_delayed_work_sync(&fd_timer); + destroy_workqueue(floppy_wq); if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 4e86393..60eed4b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -526,6 +526,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) return 0; } +static char *encode_disk_name(char *ptr, unsigned int n) +{ + if (n >= 26) + ptr = encode_disk_name(ptr, n / 26 - 1); + *ptr = 'a' + n % 26; + return ptr + 1; +} + static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, u16 vdisk_info, u16 sector_size) @@ -536,6 +544,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, unsigned int offset; int minor; int nr_parts; + char *ptr; BUG_ON(info->gd != NULL); BUG_ON(info->rq != NULL); @@ -560,7 +569,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, "emulated IDE disks,\n\t choose an xvd device name" "from xvde on\n", info->vdevice); } - err = -ENODEV; + if (minor >> MINORBITS) { + pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n", + info->vdevice, minor); + return -ENODEV; + } if ((minor % nr_parts) == 0) nr_minors = nr_parts; @@ -574,23 +587,14 @@ static int 
xlvbd_alloc_gendisk(blkif_sector_t capacity, if (gd == NULL) goto release; - if (nr_minors > 1) { - if (offset < 26) - sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); - else - sprintf(gd->disk_name, "%s%c%c", DEV_NAME, - 'a' + ((offset / 26)-1), 'a' + (offset % 26)); - } else { - if (offset < 26) - sprintf(gd->disk_name, "%s%c%d", DEV_NAME, - 'a' + offset, - minor & (nr_parts - 1)); - else - sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME, - 'a' + ((offset / 26) - 1), - 'a' + (offset % 26), - minor & (nr_parts - 1)); - } + strcpy(gd->disk_name, DEV_NAME); + ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); + BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN); + if (nr_minors > 1) + *ptr = 0; + else + snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr, + "%d", minor & (nr_parts - 1)); gd->major = XENVBD_MAJOR; gd->first_minor = minor; @@ -1496,7 +1500,9 @@ module_init(xlblk_init); static void __exit xlblk_exit(void) { - return xenbus_unregister_driver(&blkfront_driver); + xenbus_unregister_driver(&blkfront_driver); + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + kfree(minors); } module_exit(xlblk_exit); diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 92cea9d..08a7aa7 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -2116,7 +2116,7 @@ out: return ret; } -static int format_check(struct drm_mode_fb_cmd2 *r) +static int format_check(const struct drm_mode_fb_cmd2 *r) { uint32_t format = r->pixel_format & ~DRM_FORMAT_BIG_ENDIAN; @@ -2185,7 +2185,7 @@ static int format_check(struct drm_mode_fb_cmd2 *r) } } -static int framebuffer_check(struct drm_mode_fb_cmd2 *r) +static int framebuffer_check(const struct drm_mode_fb_cmd2 *r) { int ret, hsub, vsub, num_planes, i; @@ -3126,7 +3126,7 @@ int drm_mode_connector_update_edid_property(struct drm_connector *connector, EXPORT_SYMBOL(drm_mode_connector_update_edid_property); static bool drm_property_change_is_valid(struct drm_property *property, - __u64 value) + 
uint64_t value) { if (property->flags & DRM_MODE_PROP_IMMUTABLE) return false; @@ -3136,7 +3136,7 @@ static bool drm_property_change_is_valid(struct drm_property *property, return true; } else if (property->flags & DRM_MODE_PROP_BITMASK) { int i; - __u64 valid_mask = 0; + uint64_t valid_mask = 0; for (i = 0; i < property->num_values; i++) valid_mask |= (1ULL << property->values[i]); return !(value & ~valid_mask); diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 608bddf..c3b5139 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -66,6 +66,8 @@ #define EDID_QUIRK_FIRST_DETAILED_PREFERRED (1 << 5) /* use +hsync +vsync for detailed mode */ #define EDID_QUIRK_DETAILED_SYNC_PP (1 << 6) +/* Force reduced-blanking timings for detailed modes */ +#define EDID_QUIRK_FORCE_REDUCED_BLANKING (1 << 7) struct detailed_mode_closure { struct drm_connector *connector; @@ -120,6 +122,9 @@ static struct edid_quirk { /* Samsung SyncMaster 22[5-6]BW */ { "SAM", 596, EDID_QUIRK_PREFER_LARGE_60 }, { "SAM", 638, EDID_QUIRK_PREFER_LARGE_60 }, + + /* ViewSonic VA2026w */ + { "VSC", 5020, EDID_QUIRK_FORCE_REDUCED_BLANKING }, }; /*** DDC fetch and block validation ***/ @@ -885,12 +890,19 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, "Wrong Hsync/Vsync pulse width\n"); return NULL; } + + if (quirks & EDID_QUIRK_FORCE_REDUCED_BLANKING) { + mode = drm_cvt_mode(dev, hactive, vactive, 60, true, false, false); + if (!mode) + return NULL; + + goto set_size; + } + mode = drm_mode_create(dev); if (!mode) return NULL; - mode->type = DRM_MODE_TYPE_DRIVER; - if (quirks & EDID_QUIRK_135_CLOCK_TOO_HIGH) timing->pixel_clock = cpu_to_le16(1088); @@ -914,8 +926,6 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, drm_mode_do_interlace_quirk(mode, pt); - drm_mode_set_name(mode); - if (quirks & EDID_QUIRK_DETAILED_SYNC_PP) { pt->misc |= DRM_EDID_PT_HSYNC_POSITIVE | DRM_EDID_PT_VSYNC_POSITIVE; } @@ -925,6 
+935,7 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, mode->flags |= (pt->misc & DRM_EDID_PT_VSYNC_POSITIVE) ? DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; +set_size: mode->width_mm = pt->width_mm_lo | (pt->width_height_mm_hi & 0xf0) << 4; mode->height_mm = pt->height_mm_lo | (pt->width_height_mm_hi & 0xf) << 8; @@ -938,6 +949,9 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, mode->height_mm = edid->height_cm * 10; } + mode->type = DRM_MODE_TYPE_DRIVER; + drm_mode_set_name(mode); + return mode; } diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index eb2b3c2..5363e9c 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2032,6 +2032,8 @@ void i915_debugfs_cleanup(struct drm_minor *minor) 1, minor); drm_debugfs_remove_files((struct drm_info_list *) &i915_ring_stop_fops, 1, minor); + drm_debugfs_remove_files((struct drm_info_list *) &i915_error_state_fops, + 1, minor); } #endif /* CONFIG_DEBUG_FS */ diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c1e5c66..288d7b8 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2063,10 +2063,8 @@ i915_gem_object_unbind(struct drm_i915_gem_object *obj) if (obj->gtt_space == NULL) return 0; - if (obj->pin_count != 0) { - DRM_ERROR("Attempting to unbind pinned buffer\n"); - return -EINVAL; - } + if (obj->pin_count) + return -EBUSY; ret = i915_gem_object_finish_gpu(obj); if (ret) @@ -3293,6 +3291,7 @@ struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev, struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj; struct address_space *mapping; + u32 mask; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (obj == NULL) @@ -3303,8 +3302,15 @@ struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev, return NULL; } + mask = GFP_HIGHUSER | __GFP_RECLAIMABLE; + if 
(IS_CRESTLINE(dev) || IS_BROADWATER(dev)) { + /* 965gm cannot relocate objects above 4GiB. */ + mask &= ~__GFP_HIGHMEM; + mask |= __GFP_DMA32; + } + mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping; - mapping_set_gfp_mask(mapping, GFP_HIGHUSER | __GFP_RECLAIMABLE); + mapping_set_gfp_mask(mapping, mask); i915_gem_info_add_obj(dev_priv, size); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index cc4a633..1417660 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -350,8 +350,8 @@ static void gen6_pm_rps_work(struct work_struct *work) { drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, rps_work); - u8 new_delay = dev_priv->cur_delay; u32 pm_iir, pm_imr; + u8 new_delay; spin_lock_irq(&dev_priv->rps_lock); pm_iir = dev_priv->pm_iir; @@ -360,41 +360,18 @@ static void gen6_pm_rps_work(struct work_struct *work) I915_WRITE(GEN6_PMIMR, 0); spin_unlock_irq(&dev_priv->rps_lock); - if (!pm_iir) + if ((pm_iir & GEN6_PM_DEFERRED_EVENTS) == 0) return; mutex_lock(&dev_priv->dev->struct_mutex); - if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) { - if (dev_priv->cur_delay != dev_priv->max_delay) - new_delay = dev_priv->cur_delay + 1; - if (new_delay > dev_priv->max_delay) - new_delay = dev_priv->max_delay; - } else if (pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT)) { - gen6_gt_force_wake_get(dev_priv); - if (dev_priv->cur_delay != dev_priv->min_delay) - new_delay = dev_priv->cur_delay - 1; - if (new_delay < dev_priv->min_delay) { - new_delay = dev_priv->min_delay; - I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, - I915_READ(GEN6_RP_INTERRUPT_LIMITS) | - ((new_delay << 16) & 0x3f0000)); - } else { - /* Make sure we continue to get down interrupts - * until we hit the minimum frequency */ - I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, - I915_READ(GEN6_RP_INTERRUPT_LIMITS) & ~0x3f0000); - } - gen6_gt_force_wake_put(dev_priv); - } + + if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) + new_delay = 
dev_priv->cur_delay + 1; + else + new_delay = dev_priv->cur_delay - 1; gen6_set_rps(dev_priv->dev, new_delay); - dev_priv->cur_delay = new_delay; - /* - * rps_lock not held here because clearing is non-destructive. There is - * an *extremely* unlikely race with gen6_rps_enable() that is prevented - * by holding struct_mutex for the duration of the write. - */ mutex_unlock(&dev_priv->dev->struct_mutex); } diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index ee61ad1..91478942 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -910,9 +910,10 @@ static void assert_pll(struct drm_i915_private *dev_priv, /* For ILK+ */ static void assert_pch_pll(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc, bool state) + struct intel_pch_pll *pll, + struct intel_crtc *crtc, + bool state) { - int reg; u32 val; bool cur_state; @@ -921,30 +922,37 @@ static void assert_pch_pll(struct drm_i915_private *dev_priv, return; } - if (!intel_crtc->pch_pll) { - WARN(1, "asserting PCH PLL enabled with no PLL\n"); + if (WARN (!pll, + "asserting PCH PLL %s with no PLL\n", state_string(state))) return; - } - if (HAS_PCH_CPT(dev_priv->dev)) { + val = I915_READ(pll->pll_reg); + cur_state = !!(val & DPLL_VCO_ENABLE); + WARN(cur_state != state, + "PCH PLL state for reg %x assertion failure (expected %s, current %s), val=%08x\n", + pll->pll_reg, state_string(state), state_string(cur_state), val); + + /* Make sure the selected PLL is correctly attached to the transcoder */ + if (crtc && HAS_PCH_CPT(dev_priv->dev)) { u32 pch_dpll; pch_dpll = I915_READ(PCH_DPLL_SEL); - - /* Make sure the selected PLL is enabled to the transcoder */ - WARN(!((pch_dpll >> (4 * intel_crtc->pipe)) & 8), - "transcoder %d PLL not enabled\n", intel_crtc->pipe); + cur_state = pll->pll_reg == _PCH_DPLL_B; + if (!WARN(((pch_dpll >> (4 * crtc->pipe)) & 1) != cur_state, + "PLL[%d] not attached to this transcoder %d: %08x\n", + 
cur_state, crtc->pipe, pch_dpll)) { + cur_state = !!(val >> (4*crtc->pipe + 3)); + WARN(cur_state != state, + "PLL[%d] not %s on this transcoder %d: %08x\n", + pll->pll_reg == _PCH_DPLL_B, + state_string(state), + crtc->pipe, + val); + } } - - reg = intel_crtc->pch_pll->pll_reg; - val = I915_READ(reg); - cur_state = !!(val & DPLL_VCO_ENABLE); - WARN(cur_state != state, - "PCH PLL state assertion failure (expected %s, current %s)\n", - state_string(state), state_string(cur_state)); } -#define assert_pch_pll_enabled(d, p) assert_pch_pll(d, p, true) -#define assert_pch_pll_disabled(d, p) assert_pch_pll(d, p, false) +#define assert_pch_pll_enabled(d, p, c) assert_pch_pll(d, p, c, true) +#define assert_pch_pll_disabled(d, p, c) assert_pch_pll(d, p, c, false) static void assert_fdi_tx(struct drm_i915_private *dev_priv, enum pipe pipe, bool state) @@ -1424,7 +1432,7 @@ static void intel_enable_pch_pll(struct intel_crtc *intel_crtc) assert_pch_refclk_enabled(dev_priv); if (pll->active++ && pll->on) { - assert_pch_pll_enabled(dev_priv, intel_crtc); + assert_pch_pll_enabled(dev_priv, pll, NULL); return; } @@ -1460,12 +1468,12 @@ static void intel_disable_pch_pll(struct intel_crtc *intel_crtc) intel_crtc->base.base.id); if (WARN_ON(pll->active == 0)) { - assert_pch_pll_disabled(dev_priv, intel_crtc); + assert_pch_pll_disabled(dev_priv, pll, NULL); return; } if (--pll->active) { - assert_pch_pll_enabled(dev_priv, intel_crtc); + assert_pch_pll_enabled(dev_priv, pll, NULL); return; } @@ -1495,7 +1503,9 @@ static void intel_enable_transcoder(struct drm_i915_private *dev_priv, BUG_ON(dev_priv->info->gen < 5); /* Make sure PCH DPLL is enabled */ - assert_pch_pll_enabled(dev_priv, to_intel_crtc(crtc)); + assert_pch_pll_enabled(dev_priv, + to_intel_crtc(crtc)->pch_pll, + to_intel_crtc(crtc)); /* FDI must be feeding us bits for PCH ports */ assert_fdi_tx_enabled(dev_priv, pipe); diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 71c7096..296cfc2 
100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -266,6 +266,9 @@ intel_dp_mode_valid(struct drm_connector *connector, if (mode->clock < 10000) return MODE_CLOCK_LOW; + if (mode->flags & DRM_MODE_FLAG_DBLCLK) + return MODE_H_ILLEGAL; + return MODE_OK; } @@ -702,6 +705,9 @@ intel_dp_mode_fixup(struct drm_encoder *encoder, struct drm_display_mode *mode, mode->clock = intel_dp->panel_fixed_mode->clock; } + if (mode->flags & DRM_MODE_FLAG_DBLCLK) + return false; + DRM_DEBUG_KMS("DP link computation with max lane count %i " "max bw %02x pixel clock %iKHz\n", max_lane_count, bws[max_clock], mode->clock); @@ -1154,11 +1160,10 @@ static void ironlake_edp_panel_off(struct intel_dp *intel_dp) DRM_DEBUG_KMS("Turn eDP power off\n"); - WARN(intel_dp->want_panel_vdd, "Cannot turn power off while VDD is on\n"); - ironlake_panel_vdd_off_sync(intel_dp); /* finish any pending work */ + WARN(!intel_dp->want_panel_vdd, "Need VDD to turn off panel\n"); pp = ironlake_get_pp_control(dev_priv); - pp &= ~(POWER_TARGET_ON | EDP_FORCE_VDD | PANEL_POWER_RESET | EDP_BLC_ENABLE); + pp &= ~(POWER_TARGET_ON | PANEL_POWER_RESET | EDP_BLC_ENABLE); I915_WRITE(PCH_PP_CONTROL, pp); POSTING_READ(PCH_PP_CONTROL); @@ -1266,18 +1271,16 @@ static void intel_dp_prepare(struct drm_encoder *encoder) { struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + + /* Make sure the panel is off before trying to change the mode. But also + * ensure that we have vdd while we switch off the panel. 
*/ + ironlake_edp_panel_vdd_on(intel_dp); ironlake_edp_backlight_off(intel_dp); ironlake_edp_panel_off(intel_dp); - /* Wake up the sink first */ - ironlake_edp_panel_vdd_on(intel_dp); intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_ON); intel_dp_link_down(intel_dp); ironlake_edp_panel_vdd_off(intel_dp, false); - - /* Make sure the panel is off before trying to - * change the mode - */ } static void intel_dp_commit(struct drm_encoder *encoder) @@ -1309,10 +1312,11 @@ intel_dp_dpms(struct drm_encoder *encoder, int mode) uint32_t dp_reg = I915_READ(intel_dp->output_reg); if (mode != DRM_MODE_DPMS_ON) { + /* Switching the panel off requires vdd. */ + ironlake_edp_panel_vdd_on(intel_dp); ironlake_edp_backlight_off(intel_dp); ironlake_edp_panel_off(intel_dp); - ironlake_edp_panel_vdd_on(intel_dp); intel_dp_sink_dpms(intel_dp, mode); intel_dp_link_down(intel_dp); ironlake_edp_panel_vdd_off(intel_dp, false); diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c index 4a9707d..1991a44 100644 --- a/drivers/gpu/drm/i915/intel_i2c.c +++ b/drivers/gpu/drm/i915/intel_i2c.c @@ -396,11 +396,22 @@ clear_err: * Wait for bus to IDLE before clearing NAK. * If we clear the NAK while bus is still active, then it will stay * active and the next transaction may fail. + * + * If no ACK is received during the address phase of a transaction, the + * adapter must report -ENXIO. It is not clear what to return if no ACK + * is received at other times. But we have to be careful to not return + * spurious -ENXIO because that will prevent i2c and drm edid functions + * from retrying. So return -ENXIO only when gmbus properly quiescents - + * timing out seems to happen when there _is_ a ddc chip present, but + * it's slow responding and only answers on the 2nd retry. 
*/ + ret = -ENXIO; if (wait_for((I915_READ(GMBUS2 + reg_offset) & GMBUS_ACTIVE) == 0, - 10)) + 10)) { DRM_DEBUG_KMS("GMBUS [%s] timed out after NAK\n", adapter->name); + ret = -ETIMEDOUT; + } /* Toggle the Software Clear Interrupt bit. This has the effect * of resetting the GMBUS controller and so clearing the @@ -414,14 +425,6 @@ clear_err: adapter->name, msgs[i].addr, (msgs[i].flags & I2C_M_RD) ? 'r' : 'w', msgs[i].len); - /* - * If no ACK is received during the address phase of a transaction, - * the adapter must report -ENXIO. - * It is not clear what to return if no ACK is received at other times. - * So, we always return -ENXIO in all NAK cases, to ensure we send - * it at least during the one case that is specified. - */ - ret = -ENXIO; goto out; timeout: diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index 9dee823..08eb04c 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -747,6 +747,14 @@ static const struct dmi_system_id intel_no_lvds[] = { }, { .callback = intel_no_lvds_dmi_callback, + .ident = "Hewlett-Packard HP t5740e Thin Client", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP t5740e Thin Client"), + }, + }, + { + .callback = intel_no_lvds_dmi_callback, .ident = "Hewlett-Packard t5745", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 8e79ff6..d0ce2a5 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -2270,10 +2270,33 @@ void ironlake_disable_drps(struct drm_device *dev) void gen6_set_rps(struct drm_device *dev, u8 val) { struct drm_i915_private *dev_priv = dev->dev_private; - u32 swreq; + u32 limits; - swreq = (val & 0x3ff) << 25; - I915_WRITE(GEN6_RPNSWREQ, swreq); + limits = 0; + if (val >= dev_priv->max_delay) + val = dev_priv->max_delay; + else + limits |= dev_priv->max_delay << 24; + + if 
(val <= dev_priv->min_delay) + val = dev_priv->min_delay; + else + limits |= dev_priv->min_delay << 16; + + if (val == dev_priv->cur_delay) + return; + + I915_WRITE(GEN6_RPNSWREQ, + GEN6_FREQUENCY(val) | + GEN6_OFFSET(0) | + GEN6_AGGRESSIVE_TURBO); + + /* Make sure we continue to get interrupts + * until we hit the minimum or maximum frequencies. + */ + I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, limits); + + dev_priv->cur_delay = val; } void gen6_disable_rps(struct drm_device *dev) @@ -2327,11 +2350,10 @@ int intel_enable_rc6(const struct drm_device *dev) void gen6_enable_rps(struct drm_i915_private *dev_priv) { struct intel_ring_buffer *ring; - u32 rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); - u32 gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS); + u32 rp_state_cap; + u32 gt_perf_status; u32 pcu_mbox, rc6_mask = 0; u32 gtfifodbg; - int cur_freq, min_freq, max_freq; int rc6_mode; int i; @@ -2352,6 +2374,14 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) gen6_gt_force_wake_get(dev_priv); + rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); + gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS); + + /* In units of 100MHz */ + dev_priv->max_delay = rp_state_cap & 0xff; + dev_priv->min_delay = (rp_state_cap & 0xff0000) >> 16; + dev_priv->cur_delay = 0; + /* disable the counters and set deterministic thresholds */ I915_WRITE(GEN6_RC_CONTROL, 0); @@ -2399,8 +2429,8 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 1000000); I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, - 18 << 24 | - 6 << 16); + dev_priv->max_delay << 24 | + dev_priv->min_delay << 16); I915_WRITE(GEN6_RP_UP_THRESHOLD, 10000); I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 1000000); I915_WRITE(GEN6_RP_UP_EI, 100000); @@ -2408,7 +2438,7 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10); I915_WRITE(GEN6_RP_CONTROL, GEN6_RP_MEDIA_TURBO | - GEN6_RP_MEDIA_HW_MODE | + GEN6_RP_MEDIA_HW_NORMAL_MODE | GEN6_RP_MEDIA_IS_GFX | GEN6_RP_ENABLE | 
GEN6_RP_UP_BUSY_AVG | @@ -2426,10 +2456,6 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) 500)) DRM_ERROR("timeout waiting for pcode mailbox to finish\n"); - min_freq = (rp_state_cap & 0xff0000) >> 16; - max_freq = rp_state_cap & 0xff; - cur_freq = (gt_perf_status & 0xff00) >> 8; - /* Check for overclock support */ if (wait_for((I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) == 0, 500)) @@ -2440,14 +2466,11 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) 500)) DRM_ERROR("timeout waiting for pcode mailbox to finish\n"); if (pcu_mbox & (1<<31)) { /* OC supported */ - max_freq = pcu_mbox & 0xff; + dev_priv->max_delay = pcu_mbox & 0xff; DRM_DEBUG_DRIVER("overclocking supported, adjusting frequency max to %dMHz\n", pcu_mbox * 50); } - /* In units of 100MHz */ - dev_priv->max_delay = max_freq; - dev_priv->min_delay = min_freq; - dev_priv->cur_delay = cur_freq; + gen6_set_rps(dev_priv->dev, (gt_perf_status & 0xff00) >> 8); /* requires MSI enabled */ I915_WRITE(GEN6_PMIER, @@ -3580,8 +3603,9 @@ static void gen6_sanitize_pm(struct drm_device *dev) limits |= (dev_priv->min_delay & 0x3f) << 16; if (old != limits) { - DRM_ERROR("Power management discrepancy: GEN6_RP_INTERRUPT_LIMITS expected %08x, was %08x\n", - limits, old); + /* Note that the known failure case is to read back 0. 
*/ + DRM_DEBUG_DRIVER("Power management discrepancy: GEN6_RP_INTERRUPT_LIMITS " + "expected %08x, was %08x\n", limits, old); I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, limits); } diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index a949b73..b6a9d45 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -783,10 +783,12 @@ static void intel_sdvo_get_dtd_from_mode(struct intel_sdvo_dtd *dtd, ((v_sync_len & 0x30) >> 4); dtd->part2.dtd_flags = 0x18; + if (mode->flags & DRM_MODE_FLAG_INTERLACE) + dtd->part2.dtd_flags |= DTD_FLAG_INTERLACE; if (mode->flags & DRM_MODE_FLAG_PHSYNC) - dtd->part2.dtd_flags |= 0x2; + dtd->part2.dtd_flags |= DTD_FLAG_HSYNC_POSITIVE; if (mode->flags & DRM_MODE_FLAG_PVSYNC) - dtd->part2.dtd_flags |= 0x4; + dtd->part2.dtd_flags |= DTD_FLAG_VSYNC_POSITIVE; dtd->part2.sdvo_flags = 0; dtd->part2.v_sync_off_high = v_sync_offset & 0xc0; @@ -820,9 +822,11 @@ static void intel_sdvo_get_mode_from_dtd(struct drm_display_mode * mode, mode->clock = dtd->part1.clock * 10; mode->flags &= ~(DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC); - if (dtd->part2.dtd_flags & 0x2) + if (dtd->part2.dtd_flags & DTD_FLAG_INTERLACE) + mode->flags |= DRM_MODE_FLAG_INTERLACE; + if (dtd->part2.dtd_flags & DTD_FLAG_HSYNC_POSITIVE) mode->flags |= DRM_MODE_FLAG_PHSYNC; - if (dtd->part2.dtd_flags & 0x4) + if (dtd->part2.dtd_flags & DTD_FLAG_VSYNC_POSITIVE) mode->flags |= DRM_MODE_FLAG_PVSYNC; } diff --git a/drivers/gpu/drm/i915/intel_sdvo_regs.h b/drivers/gpu/drm/i915/intel_sdvo_regs.h index 6b7b22f..9d03014 100644 --- a/drivers/gpu/drm/i915/intel_sdvo_regs.h +++ b/drivers/gpu/drm/i915/intel_sdvo_regs.h @@ -61,6 +61,11 @@ struct intel_sdvo_caps { u16 output_flags; } __attribute__((packed)); +/* Note: SDVO detailed timing flags match EDID misc flags. 
*/ +#define DTD_FLAG_HSYNC_POSITIVE (1 << 1) +#define DTD_FLAG_VSYNC_POSITIVE (1 << 2) +#define DTD_FLAG_INTERLACE (1 << 7) + /** This matches the EDID DTD structure, more or less */ struct intel_sdvo_dtd { struct { diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c index 3346612..a233a51 100644 --- a/drivers/gpu/drm/i915/intel_tv.c +++ b/drivers/gpu/drm/i915/intel_tv.c @@ -674,6 +674,54 @@ static const struct tv_mode tv_modes[] = { .filter_table = filter_table, }, { + .name = "480p", + .clock = 107520, + .refresh = 59940, + .oversample = TV_OVERSAMPLE_4X, + .component_only = 1, + + .hsync_end = 64, .hblank_end = 122, + .hblank_start = 842, .htotal = 857, + + .progressive = true, .trilevel_sync = false, + + .vsync_start_f1 = 12, .vsync_start_f2 = 12, + .vsync_len = 12, + + .veq_ena = false, + + .vi_end_f1 = 44, .vi_end_f2 = 44, + .nbr_end = 479, + + .burst_ena = false, + + .filter_table = filter_table, + }, + { + .name = "576p", + .clock = 107520, + .refresh = 50000, + .oversample = TV_OVERSAMPLE_4X, + .component_only = 1, + + .hsync_end = 64, .hblank_end = 139, + .hblank_start = 859, .htotal = 863, + + .progressive = true, .trilevel_sync = false, + + .vsync_start_f1 = 10, .vsync_start_f2 = 10, + .vsync_len = 10, + + .veq_ena = false, + + .vi_end_f1 = 48, .vi_end_f2 = 48, + .nbr_end = 575, + + .burst_ena = false, + + .filter_table = filter_table, + }, + { .name = "720p@60Hz", .clock = 148800, .refresh = 60000, @@ -1194,6 +1242,11 @@ intel_tv_detect_type(struct intel_tv *intel_tv, I915_WRITE(TV_DAC, save_tv_dac & ~TVDAC_STATE_CHG_EN); I915_WRITE(TV_CTL, save_tv_ctl); + POSTING_READ(TV_CTL); + + /* For unknown reasons the hw barfs if we don't do this vblank wait. 
*/ + intel_wait_for_vblank(intel_tv->base.base.dev, + to_intel_crtc(intel_tv->base.base.crtc)->pipe); /* Restore interrupt config */ if (connector->polled & DRM_CONNECTOR_POLL_HPD) { diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index b01c2dd..ce4e7cc 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -865,7 +865,7 @@ static void cayman_gpu_init(struct radeon_device *rdev) /* num banks is 8 on all fusion asics. 0 = 4, 1 = 8, 2 = 16 */ if (rdev->flags & RADEON_IS_IGP) - rdev->config.evergreen.tile_config |= 1 << 4; + rdev->config.cayman.tile_config |= 1 << 4; else rdev->config.cayman.tile_config |= ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) << 4; diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 1dc3a4a..492654f 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -848,7 +848,6 @@ struct radeon_cs_parser { s32 priority; }; -extern int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx); extern int radeon_cs_finish_pages(struct radeon_cs_parser *p); extern u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx); diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index f6e69b8..b1e3820 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -444,7 +444,9 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev, */ if ((dev->pdev->device == 0x9498) && (dev->pdev->subsystem_vendor == 0x1682) && - (dev->pdev->subsystem_device == 0x2452)) { + (dev->pdev->subsystem_device == 0x2452) && + (i2c_bus->valid == false) && + !(supported_device & (ATOM_DEVICE_TV_SUPPORT | ATOM_DEVICE_CV_SUPPORT))) { struct radeon_device *rdev = dev->dev_private; *i2c_bus = radeon_lookup_i2c_gpio(rdev, 0x93); } diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index c7d64a7..0137689 100644 --- 
a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -580,7 +580,7 @@ int radeon_cs_finish_pages(struct radeon_cs_parser *p) return 0; } -int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx) +static int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx) { int new_page; struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; @@ -623,3 +623,28 @@ int radeon_cs_update_pages(struct radeon_cs_parser *p, int pg_idx) return new_page; } + +u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx) +{ + struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; + u32 pg_idx, pg_offset; + u32 idx_value = 0; + int new_page; + + pg_idx = (idx * 4) / PAGE_SIZE; + pg_offset = (idx * 4) % PAGE_SIZE; + + if (ibc->kpage_idx[0] == pg_idx) + return ibc->kpage[0][pg_offset/4]; + if (ibc->kpage_idx[1] == pg_idx) + return ibc->kpage[1][pg_offset/4]; + + new_page = radeon_cs_update_pages(p, pg_idx); + if (new_page < 0) { + p->parser_error = new_page; + return 0; + } + + idx_value = ibc->kpage[new_page][pg_offset/4]; + return idx_value; +} diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c index 493a7be..983658c 100644 --- a/drivers/gpu/drm/radeon/radeon_ring.c +++ b/drivers/gpu/drm/radeon/radeon_ring.c @@ -39,31 +39,6 @@ */ int radeon_debugfs_sa_init(struct radeon_device *rdev); -u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx) -{ - struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; - u32 pg_idx, pg_offset; - u32 idx_value = 0; - int new_page; - - pg_idx = (idx * 4) / PAGE_SIZE; - pg_offset = (idx * 4) % PAGE_SIZE; - - if (ibc->kpage_idx[0] == pg_idx) - return ibc->kpage[0][pg_offset/4]; - if (ibc->kpage_idx[1] == pg_idx) - return ibc->kpage[1][pg_offset/4]; - - new_page = radeon_cs_update_pages(p, pg_idx); - if (new_page < 0) { - p->parser_error = new_page; - return 0; - } - - idx_value = ibc->kpage[new_page][pg_offset/4]; - return idx_value; -} - int radeon_ib_get(struct 
radeon_device *rdev, int ring, struct radeon_ib *ib, unsigned size) { diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 40efd32..97acc9c 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -234,7 +234,7 @@ int udl_gem_mmap(struct drm_file *file, struct drm_device *dev, ret = udl_gem_get_pages(gobj, GFP_KERNEL); if (ret) - return ret; + goto out; if (!gobj->base.map_list.map) { ret = drm_gem_create_mmap_offset(obj); if (ret) @@ -257,8 +257,6 @@ static int udl_prime_create(struct drm_device *dev, { struct udl_gem_object *obj; int npages; - int i; - struct scatterlist *iter; npages = size / PAGE_SIZE; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a5bee8e..d90a421 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -450,12 +450,27 @@ static void dump_command(unsigned long phys_addr) static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { - u32 *event = __evt; - int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; - int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; - int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; - int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; - u64 address = (u64)(((u64)event[3]) << 32) | event[2]; + int type, devid, domid, flags; + volatile u32 *event = __evt; + int count = 0; + u64 address; + +retry: + type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; + devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; + domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; + flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; + address = (u64)(((u64)event[3]) << 32) | event[2]; + + if (type == 0) { + /* Did we hit the erratum? 
*/ + if (++count == LOOP_TIMEOUT) { + pr_err("AMD-Vi: No event written to event log\n"); + return; + } + udelay(1); + goto retry; + } printk(KERN_ERR "AMD-Vi: Event logged ["); @@ -508,6 +523,8 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) default: printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); } + + memset(__evt, 0, 4 * sizeof(u32)); } static void iommu_poll_events(struct amd_iommu *iommu) @@ -2035,20 +2052,20 @@ out_err: } /* FIXME: Move this to PCI code */ -#define PCI_PRI_TLP_OFF (1 << 2) +#define PCI_PRI_TLP_OFF (1 << 15) bool pci_pri_tlp_required(struct pci_dev *pdev) { - u16 control; + u16 status; int pos; pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return false; - pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); - return (control & PCI_PRI_TLP_OFF) ? true : false; + return (status & PCI_PRI_TLP_OFF) ? true : false; } /* diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2198b2d..8b9ded8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -119,6 +119,7 @@ EXPORT_SYMBOL_GPL(iommu_present); * iommu_set_fault_handler() - set a fault handler for an iommu domain * @domain: iommu domain * @handler: fault handler + * @token: user data, will be passed back to the fault handler * * This function should be used by IOMMU users which want to be notified * whenever an IOMMU fault happens. @@ -127,11 +128,13 @@ EXPORT_SYMBOL_GPL(iommu_present); * error code otherwise. 
*/ void iommu_set_fault_handler(struct iommu_domain *domain, - iommu_fault_handler_t handler) + iommu_fault_handler_t handler, + void *token) { BUG_ON(!domain); domain->handler = handler; + domain->handler_token = token; } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 6899dcd..e70ee2b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -41,11 +41,13 @@ * @pgtable: the page table * @iommu_dev: an omap iommu device attached to this domain. only a single * iommu device can be attached for now. + * @dev: Device using this domain. * @lock: domain lock, should be taken when attaching/detaching */ struct omap_iommu_domain { u32 *pgtable; struct omap_iommu *iommu_dev; + struct device *dev; spinlock_t lock; }; @@ -1081,6 +1083,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) } omap_domain->iommu_dev = arch_data->iommu_dev = oiommu; + omap_domain->dev = dev; oiommu->domain = domain; out: @@ -1088,19 +1091,16 @@ out: return ret; } -static void omap_iommu_detach_dev(struct iommu_domain *domain, - struct device *dev) +static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, + struct device *dev) { - struct omap_iommu_domain *omap_domain = domain->priv; - struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; struct omap_iommu *oiommu = dev_to_omap_iommu(dev); - - spin_lock(&omap_domain->lock); + struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; /* only a single device is supported per domain for now */ if (omap_domain->iommu_dev != oiommu) { dev_err(dev, "invalid iommu device\n"); - goto out; + return; } iopgtable_clear_entry_all(oiommu); @@ -1108,8 +1108,16 @@ static void omap_iommu_detach_dev(struct iommu_domain *domain, omap_iommu_detach(oiommu); omap_domain->iommu_dev = arch_data->iommu_dev = NULL; + omap_domain->dev = NULL; +} -out: +static void omap_iommu_detach_dev(struct iommu_domain *domain, + struct device 
*dev) +{ + struct omap_iommu_domain *omap_domain = domain->priv; + + spin_lock(&omap_domain->lock); + _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); } @@ -1148,13 +1156,19 @@ out: return -ENOMEM; } -/* assume device was already detached */ static void omap_iommu_domain_destroy(struct iommu_domain *domain) { struct omap_iommu_domain *omap_domain = domain->priv; domain->priv = NULL; + /* + * An iommu device is still attached + * (currently, only one device can be attached) ? + */ + if (omap_domain->iommu_dev) + _omap_iommu_detach_dev(omap_domain, omap_domain->dev); + kfree(omap_domain->pgtable); kfree(omap_domain); } diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 779306e..0c0a377 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -29,15 +29,17 @@ #include <linux/device.h> #include <linux/io.h> #include <linux/iommu.h> +#include <linux/of.h> #include <asm/cacheflush.h> /* bitmap of the page sizes currently supported */ #define GART_IOMMU_PGSIZES (SZ_4K) -#define GART_CONFIG 0x24 -#define GART_ENTRY_ADDR 0x28 -#define GART_ENTRY_DATA 0x2c +#define GART_REG_BASE 0x24 +#define GART_CONFIG (0x24 - GART_REG_BASE) +#define GART_ENTRY_ADDR (0x28 - GART_REG_BASE) +#define GART_ENTRY_DATA (0x2c - GART_REG_BASE) #define GART_ENTRY_PHYS_ADDR_VALID (1 << 31) #define GART_PAGE_SHIFT 12 @@ -158,7 +160,7 @@ static int gart_iommu_attach_dev(struct iommu_domain *domain, struct gart_client *client, *c; int err = 0; - gart = dev_get_drvdata(dev->parent); + gart = gart_handle; if (!gart) return -EINVAL; domain->priv = gart; @@ -422,6 +424,14 @@ const struct dev_pm_ops tegra_gart_pm_ops = { .resume = tegra_gart_resume, }; +#ifdef CONFIG_OF +static struct of_device_id tegra_gart_of_match[] __devinitdata = { + { .compatible = "nvidia,tegra20-gart", }, + { }, +}; +MODULE_DEVICE_TABLE(of, tegra_gart_of_match); +#endif + static struct platform_driver tegra_gart_driver = { .probe = tegra_gart_probe, .remove = 
tegra_gart_remove, @@ -429,6 +439,7 @@ static struct platform_driver tegra_gart_driver = { .owner = THIS_MODULE, .name = "tegra-gart", .pm = &tegra_gart_pm_ops, + .of_match_table = of_match_ptr(tegra_gart_of_match), }, }; @@ -448,4 +459,5 @@ module_exit(tegra_gart_exit); MODULE_DESCRIPTION("IOMMU API for GART in Tegra20"); MODULE_AUTHOR("Hiroshi DOYU <hdoyu@nvidia.com>"); +MODULE_ALIAS("platform:tegra-gart"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index eb93c821..ecd6790 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -733,7 +733,7 @@ static int smmu_iommu_attach_dev(struct iommu_domain *domain, pr_info("Reserve \"page zero\" for AVP vectors using a common dummy\n"); } - dev_dbg(smmu->dev, "%s is attached\n", dev_name(c->dev)); + dev_dbg(smmu->dev, "%s is attached\n", dev_name(dev)); return 0; err_client: diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index d6f8ada..8ea7bcc 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -78,7 +78,7 @@ typedef int (*rproc_handle_resource_t)(struct rproc *rproc, void *, int avail); * the recovery of the remote processor. 
*/ static int rproc_iommu_fault(struct iommu_domain *domain, struct device *dev, - unsigned long iova, int flags) + unsigned long iova, int flags, void *token) { dev_err(dev, "iommu fault: da 0x%lx flags 0x%x\n", iova, flags); @@ -117,7 +117,7 @@ static int rproc_enable_iommu(struct rproc *rproc) return -ENOMEM; } - iommu_set_fault_handler(domain, rproc_iommu_fault); + iommu_set_fault_handler(domain, rproc_iommu_fault, rproc); ret = iommu_attach_device(domain, dev); if (ret) { @@ -19,12 +19,14 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> +#include <linux/cgroup.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> @@ -418,6 +420,7 @@ void bio_put(struct bio *bio) * last put frees it */ if (atomic_dec_and_test(&bio->bi_cnt)) { + bio_disassociate_task(bio); bio->bi_next = NULL; bio->bi_destructor(bio); } @@ -1646,6 +1649,64 @@ bad: } EXPORT_SYMBOL(bioset_create); +#ifdef CONFIG_BLK_CGROUP +/** + * bio_associate_current - associate a bio with %current + * @bio: target bio + * + * Associate @bio with %current if it hasn't been associated yet. Block + * layer will treat @bio as if it were issued by %current no matter which + * task actually issues it. + * + * This function takes an extra reference of @task's io_context and blkcg + * which will be put when @bio is released. The caller must own @bio, + * ensure %current->io_context exists, and is responsible for synchronizing + * calls to this function. 
+ */ +int bio_associate_current(struct bio *bio) +{ + struct io_context *ioc; + struct cgroup_subsys_state *css; + + if (bio->bi_ioc) + return -EBUSY; + + ioc = current->io_context; + if (!ioc) + return -ENOENT; + + /* acquire active ref on @ioc and associate */ + get_io_context_active(ioc); + bio->bi_ioc = ioc; + + /* associate blkcg if exists */ + rcu_read_lock(); + css = task_subsys_state(current, blkio_subsys_id); + if (css && css_tryget(css)) + bio->bi_css = css; + rcu_read_unlock(); + + return 0; +} + +/** + * bio_disassociate_task - undo bio_associate_current() + * @bio: target bio + */ +void bio_disassociate_task(struct bio *bio) +{ + if (bio->bi_ioc) { + put_io_context(bio->bi_ioc); + bio->bi_ioc = NULL; + } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } +} + +#endif /* CONFIG_BLK_CGROUP */ + static void __init biovec_init_slabs(void) { int i; diff --git a/fs/ioprio.c b/fs/ioprio.c index 5e6dbe89..e50170c 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { - ioc_ioprio_changed(ioc, ioprio); + ioc->ioprio = ioprio; put_io_context(ioc); } diff --git a/fs/splice.c b/fs/splice.c index f847684..406ef2b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1388,7 +1388,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned, + struct partial_page *partial, bool aligned, unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1626,7 +1626,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, return -ENOMEM; spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, - spd.partial, flags & SPLICE_F_GIFT, + spd.partial, false, pipe->buffers); if (spd.nr_pages <= 0) ret = spd.nr_pages; diff --git 
a/include/linux/bio.h b/include/linux/bio.h index 4d94eb8..2643589 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -269,6 +269,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +#ifdef CONFIG_BLK_CGROUP +int bio_associate_current(struct bio *bio); +void bio_disassociate_task(struct bio *bio); +#else /* CONFIG_BLK_CGROUP */ +static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } +static inline void bio_disassociate_task(struct bio *bio) { } +#endif /* CONFIG_BLK_CGROUP */ + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4053cbd..0edb65d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -14,6 +14,8 @@ struct bio; struct bio_integrity_payload; struct page; struct block_device; +struct io_context; +struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -66,6 +68,14 @@ struct bio { bio_end_io_t *bi_end_io; void *bi_private; +#ifdef CONFIG_BLK_CGROUP + /* + * Optional ioc and css associated with this bio. Put on bio + * release. Read comment on top of bio_associate_current(). 
+ */ + struct io_context *bi_ioc; + struct cgroup_subsys_state *bi_css; +#endif #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4d4ac24..ba43f40 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -32,10 +32,17 @@ struct blk_trace; struct request; struct sg_io_hdr; struct bsg_job; +struct blkcg_gq; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* + * Maximum number of blkcg policies allowed to be registered concurrently. + * Defined here to simplify include dependency. + */ +#define BLKCG_MAX_POLS 2 + struct request; typedef void (rq_end_io_fn)(struct request *, int); @@ -363,6 +370,11 @@ struct request_queue { struct list_head timeout_list; struct list_head icq_list; +#ifdef CONFIG_BLK_CGROUP + DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); + struct blkcg_gq *root_blkg; + struct list_head blkg_list; +#endif struct queue_limits limits; @@ -390,12 +402,17 @@ struct request_queue { struct mutex sysfs_lock; + int bypass_depth; + #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; int bsg_job_size; struct bsg_class_device bsg_dev; #endif +#ifdef CONFIG_BLK_CGROUP + struct list_head all_q_node; +#endif #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; @@ -407,7 +424,7 @@ struct request_queue { #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ -#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ +#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ @@ -491,6 +508,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue 
*q) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) +#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 9e5f560..47e3d48 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.11" +#define REL_VERSION "8.3.13" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 96 @@ -112,8 +112,8 @@ enum drbd_ret_code { ERR_OPEN_MD_DISK = 105, ERR_DISK_NOT_BDEV = 107, ERR_MD_NOT_BDEV = 108, - ERR_DISK_TO_SMALL = 111, - ERR_MD_DISK_TO_SMALL = 112, + ERR_DISK_TOO_SMALL = 111, + ERR_MD_DISK_TOO_SMALL = 112, ERR_BDCLAIM_DISK = 114, ERR_BDCLAIM_MD_DISK = 115, ERR_MD_IDX_INVALID = 116, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 447c367..fb670bf 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -48,6 +48,11 @@ #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ + /* If backing disk takes longer than disk_timeout, mark the disk as failed */ +#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ +#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ +#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ + /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 @@ -60,7 +65,7 @@ /* timeout for the ping packets.*/ #define DRBD_PING_TIMEO_MIN 1 -#define DRBD_PING_TIMEO_MAX 100 +#define DRBD_PING_TIMEO_MAX 300 #define DRBD_PING_TIMEO_DEF 5 /* max number of write requests between write barriers */ diff --git a/include/linux/drbd_nl.h 
b/include/linux/drbd_nl.h index ab6159e4..a8706f0 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -31,9 +31,12 @@ NL_PACKET(disk_conf, 3, NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) + NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout) ) -NL_PACKET(detach, 4, ) +NL_PACKET(detach, 4, + NL_BIT( 88, T_MANDATORY, detach_force) +) NL_PACKET(net_conf, 5, NL_STRING( 8, T_MANDATORY, my_addr, 128) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 7d4e035..c03af76 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -28,12 +28,13 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); -typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); +typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, + struct bio *, gfp_t); typedef void (elevator_put_req_fn) (struct request *); typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); -typedef void *(elevator_init_fn) (struct request_queue *); +typedef int (elevator_init_fn) (struct request_queue *); typedef void (elevator_exit_fn) (struct elevator_queue *); struct elevator_ops @@ -129,7 +130,8 @@ extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); -extern int elv_set_request(struct request_queue *, struct request *, gfp_t); +extern int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask); extern void elv_put_request(struct request_queue *, struct request *); extern void 
elv_drain_elevator(struct request_queue *); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 1a30180..df38db2 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -6,11 +6,7 @@ #include <linux/workqueue.h> enum { - ICQ_IOPRIO_CHANGED = 1 << 0, - ICQ_CGROUP_CHANGED = 1 << 1, ICQ_EXITED = 1 << 2, - - ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; /* @@ -100,6 +96,7 @@ struct io_cq { */ struct io_context { atomic_long_t refcount; + atomic_t active_ref; atomic_t nr_tasks; /* all the fields below are protected by this lock */ @@ -120,29 +117,37 @@ struct io_context { struct work_struct release_work; }; -static inline struct io_context *ioc_task_link(struct io_context *ioc) +/** + * get_io_context_active - get active reference on ioc + * @ioc: ioc of interest + * + * Only iocs with active reference can issue new IOs. This function + * acquires an active reference on @ioc. The caller must already have an + * active reference on @ioc. + */ +static inline void get_io_context_active(struct io_context *ioc) { - /* - * if ref count is zero, don't allow sharing (ioc is going away, it's - * a race). 
- */ - if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) { - atomic_inc(&ioc->nr_tasks); - return ioc; - } + WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); + WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); + atomic_long_inc(&ioc->refcount); + atomic_inc(&ioc->active_ref); +} + +static inline void ioc_task_link(struct io_context *ioc) +{ + get_io_context_active(ioc); - return NULL; + WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_inc(&ioc->nr_tasks); } struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); +void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); -void ioc_ioprio_changed(struct io_context *ioc, int ioprio); -void ioc_cgroup_changed(struct io_context *ioc); -unsigned int icq_get_changed(struct io_cq *icq); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d937580..450293f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -35,12 +35,13 @@ struct iommu_domain; #define IOMMU_FAULT_WRITE 0x1 typedef int (*iommu_fault_handler_t)(struct iommu_domain *, - struct device *, unsigned long, int); + struct device *, unsigned long, int, void *); struct iommu_domain { struct iommu_ops *ops; void *priv; iommu_fault_handler_t handler; + void *handler_token; }; #define IOMMU_CAP_CACHE_COHERENCY 0x1 @@ -95,7 +96,7 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, extern int iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap); extern void iommu_set_fault_handler(struct iommu_domain *domain, - iommu_fault_handler_t handler); + iommu_fault_handler_t handler, void *token); extern int iommu_device_group(struct device *dev, unsigned int *groupid); /** @@ -132,7 +133,8 @@ static inline int report_iommu_fault(struct iommu_domain *domain, * 
invoke it. */ if (domain->handler) - ret = domain->handler(domain, dev, iova, flags); + ret = domain->handler(domain, dev, iova, flags, + domain->handler_token); return ret; } @@ -191,7 +193,7 @@ static inline int domain_has_cap(struct iommu_domain *domain, } static inline void iommu_set_fault_handler(struct iommu_domain *domain, - iommu_fault_handler_t handler) + iommu_fault_handler_t handler, void *token) { } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 76dad48..beb9ce1 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -42,26 +42,14 @@ enum { }; /* - * if process has set io priority explicitly, use that. if not, convert - * the cpu scheduler nice value to an io priority + * Fallback BE priority */ #define IOPRIO_NORM (4) -static inline int task_ioprio(struct io_context *ioc) -{ - if (ioprio_valid(ioc->ioprio)) - return IOPRIO_PRIO_DATA(ioc->ioprio); - - return IOPRIO_NORM; -} - -static inline int task_ioprio_class(struct io_context *ioc) -{ - if (ioprio_valid(ioc->ioprio)) - return IOPRIO_PRIO_CLASS(ioc->ioprio); - - return IOPRIO_CLASS_BE; -} +/* + * if process has set io priority explicitly, use that. 
if not, convert + * the cpu scheduler nice value to an io priority + */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; diff --git a/init/Kconfig b/init/Kconfig index 81816b8..1e004d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -803,7 +803,7 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED config BLK_CGROUP - tristate "Block IO controller" + bool "Block IO controller" depends on BLOCK default n ---help--- diff --git a/kernel/fork.c b/kernel/fork.c index 017fb23d..31a32c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -976,9 +976,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { - tsk->io_context = ioc_task_link(ioc); - if (unlikely(!tsk->io_context)) - return -ENOMEM; + ioc_task_link(ioc); + tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); if (unlikely(!new_ioc)) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 13ef233..518aea7 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -430,7 +430,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void) */ static struct dma_debug_entry *dma_entry_alloc(void) { - struct dma_debug_entry *entry = NULL; + struct dma_debug_entry *entry; unsigned long flags; spin_lock_irqsave(&free_entries_lock, flags); @@ -438,11 +438,14 @@ static struct dma_debug_entry *dma_entry_alloc(void) if (list_empty(&free_entries)) { pr_err("DMA-API: debugging out of memory - disabling\n"); global_disable = true; - goto out; + spin_unlock_irqrestore(&free_entries_lock, flags); + return NULL; } entry = __dma_entry_alloc(); + spin_unlock_irqrestore(&free_entries_lock, flags); + #ifdef CONFIG_STACKTRACE entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; entry->stacktrace.entries = entry->st_entries; @@ -450,9 +453,6 @@ static struct dma_debug_entry *dma_entry_alloc(void) 
save_stack_trace(&entry->stacktrace); #endif -out: - spin_unlock_irqrestore(&free_entries_lock, flags); - return entry; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 285a81e..e198831 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3036,7 +3036,8 @@ int hugetlb_reserve_pages(struct inode *inode, region_add(&inode->i_mapping->private_list, from, to); return 0; out_err: - resv_map_put(vma); + if (vma) + resv_map_put(vma); return ret; } |