summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-driver-xen-blkback17
-rw-r--r--Documentation/ABI/testing/sysfs-driver-xen-blkfront10
-rw-r--r--Documentation/cgroups/blkio-controller.txt29
-rw-r--r--MAINTAINERS2
-rw-r--r--block/blk-cgroup.c105
-rw-r--r--block/blk-cgroup.h38
-rw-r--r--block/blk-throttle.c1064
-rw-r--r--drivers/block/Kconfig4
-rw-r--r--drivers/block/drbd/drbd_actlog.c21
-rw-r--r--drivers/block/drbd/drbd_int.h15
-rw-r--r--drivers/block/drbd/drbd_main.c61
-rw-r--r--drivers/block/drbd/drbd_nl.c185
-rw-r--r--drivers/block/drbd/drbd_receiver.c12
-rw-r--r--drivers/block/drbd/drbd_state.c4
-rw-r--r--drivers/block/rsxx/core.c359
-rw-r--r--drivers/block/rsxx/cregs.c14
-rw-r--r--drivers/block/rsxx/dev.c33
-rw-r--r--drivers/block/rsxx/dma.c185
-rw-r--r--drivers/block/rsxx/rsxx_priv.h10
-rw-r--r--drivers/block/xen-blkback/blkback.c872
-rw-r--r--drivers/block/xen-blkback/common.h147
-rw-r--r--drivers/block/xen-blkback/xenbus.c85
-rw-r--r--drivers/block/xen-blkfront.c532
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--include/linux/drbd.h6
-rw-r--r--include/linux/drbd_genl.h2
-rw-r--r--include/linux/drbd_limits.h9
-rw-r--r--include/xen/interface/io/blkif.h53
-rw-r--r--include/xen/interface/io/ring.h5
29 files changed, 2854 insertions, 1027 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
new file mode 100644
index 0000000..8bb43b6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -0,0 +1,17 @@
+What: /sys/module/xen_blkback/parameters/max_buffer_pages
+Date: March 2013
+KernelVersion: 3.11
+Contact: Roger Pau Monné <roger.pau@citrix.com>
+Description:
+ Maximum number of free pages to keep in each block
+ backend buffer.
+
+What: /sys/module/xen_blkback/parameters/max_persistent_grants
+Date: March 2013
+KernelVersion: 3.11
+Contact: Roger Pau Monné <roger.pau@citrix.com>
+Description:
+ Maximum number of grants to map persistently in
+ blkback. If the frontend tries to use more than
+ max_persistent_grants, the LRU kicks in and starts
+ removing 5% of max_persistent_grants every 100ms.
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
new file mode 100644
index 0000000..c0a6cb7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -0,0 +1,10 @@
+What: /sys/module/xen_blkfront/parameters/max
+Date: June 2013
+KernelVersion: 3.11
+Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Description:
+ Maximum number of segments that the frontend will negotiate
+ with the backend for indirect descriptors. The default value
+ is 32 - higher value means more potential throughput but more
+ memory usage. The backend picks the minimum of the frontend
+ and its default backend value.
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index da272c8..cd556b9 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,11 +94,13 @@ Throttling/Upper Limit policy
Hierarchical Cgroups
====================
-- Currently only CFQ supports hierarchical groups. For throttling,
- cgroup interface does allow creation of hierarchical cgroups and
- internally it treats them as flat hierarchy.
- If somebody created a hierarchy like as follows.
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows.
root
/ \
@@ -106,21 +108,20 @@ Hierarchical Cgroups
|
test3
- CFQ will handle the hierarchy correctly but and throttling will
- practically treat all groups at same level. For details on CFQ
- hierarchy support, refer to Documentation/block/cfq-iosched.txt.
- Throttling will treat the hierarchy as if it looks like the
- following.
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly. For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt. For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following.
pivot
/ / \ \
root test1 test2 test3
- Nesting cgroups, while allowed, isn't officially supported and blkio
- genereates warning when cgroups nest. Once throttling implements
- hierarchy support, hierarchy will be supported and the warning will
- be removed.
-
Various user visible config options
===================================
CONFIG_BLK_CGROUP
diff --git a/MAINTAINERS b/MAINTAINERS
index 58814f0..a0a76fb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3297,7 +3297,7 @@ F: Documentation/firmware_class/
F: drivers/base/firmware*.c
F: include/linux/firmware.h
-FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card)
+FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card)
M: Joshua Morris <josh.h.morris@us.ibm.com>
M: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
S: Maintained
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ff..290792a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
- struct request_queue *q, bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
- * read locked. If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs. The caller may
- * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
- * subtree.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
- cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
- if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
- (p_blkg)->q, false)))
-
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
{
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (!blkg)
return;
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
- struct blkg_policy_data *pd = blkg->pd[i];
-
- if (!pd)
- continue;
-
- if (pol && pol->pd_exit_fn)
- pol->pd_exit_fn(blkg);
-
- kfree(pd);
- }
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ kfree(blkg->pd[i]);
blk_exit_rl(&blkg->rl);
kfree(blkg);
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->pd[i] = pd;
pd->blkg = blkg;
pd->plid = i;
-
- /* invoke per-policy init */
- if (pol->pd_init_fn)
- pol->pd_init_fn(blkg);
}
return blkg;
@@ -158,8 +124,8 @@ err_free:
* @q's bypass state. If @update_hint is %true, the caller should be
* holding @q->queue_lock and lookup hint is updated on success.
*/
-static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
- struct request_queue *q, bool update_hint)
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+ bool update_hint)
{
struct blkcg_gq *blkg;
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
}
blkg = new_blkg;
- /* link parent and insert */
+ /* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
- blkg = ERR_PTR(-EINVAL);
+ ret = -EINVAL;
goto err_put_css;
}
blkg_get(blkg->parent);
}
+ /* invoke per-policy init */
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (blkg->pd[i] && pol->pd_init_fn)
+ pol->pd_init_fn(blkg);
+ }
+
+ /* insert */
spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
if (likely(!ret)) {
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
q->root_rl.blkg = NULL;
}
-static void blkg_rcu_free(struct rcu_head *rcu_head)
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid. For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+void __blkg_release_rcu(struct rcu_head *rcu_head)
{
- blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
-}
+ struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
+ int i;
+
+ /* tell policies that this one is being freed */
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (blkg->pd[i] && pol->pd_exit_fn)
+ pol->pd_exit_fn(blkg);
+ }
-void __blkg_release(struct blkcg_gq *blkg)
-{
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
- if (blkg->parent)
+ if (blkg->parent) {
+ spin_lock_irq(blkg->q->queue_lock);
blkg_put(blkg->parent);
+ spin_unlock_irq(blkg->q->queue_lock);
+ }
- /*
- * A group is freed in rcu manner. But having an rcu lock does not
- * mean that one can access all the fields of blkg and assume these
- * are valid. For example, don't try to follow throtl_data and
- * request queue links.
- *
- * Having a reference to blkg under an rcu allows acess to only
- * values local to groups like group stats and group rate limits
- */
- call_rcu(&blkg->rcu_head, blkg_rcu_free);
+ blkg_free(blkg);
}
-EXPORT_SYMBOL_GPL(__blkg_release);
+EXPORT_SYMBOL_GPL(__blkg_release_rcu);
/*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
.subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files,
.module = THIS_MODULE,
-
- /*
- * blkio subsystem is utterly broken in terms of hierarchy support.
- * It treats all cgroups equally regardless of where they're
- * located in the hierarchy - all cgroups are treated as if they're
- * right below the root. Fix it and remove the following.
- */
- .broken_hierarchy = true,
};
EXPORT_SYMBOL_GPL(blkio_subsys);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee..8056c03 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
blkg->refcnt++;
}
-void __blkg_release(struct blkcg_gq *blkg);
+void __blkg_release_rcu(struct rcu_head *rcu);
/**
* blkg_put - put a blkg reference
@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
lockdep_assert_held(blkg->q->queue_lock);
WARN_ON_ONCE(blkg->refcnt <= 0);
if (!--blkg->refcnt)
- __blkg_release(blkg);
+ call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+ bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
+ * subtree.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+ if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+ (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
+ cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
+ if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+ (p_blkg)->q, false)))
+
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3114622..08a32df 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-static void throtl_schedule_delayed_work(struct throtl_data *td,
- unsigned long delay);
-
-struct throtl_rb_root {
- struct rb_root rb;
- struct rb_node *left;
- unsigned int count;
- unsigned long min_disptime;
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued. When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from. When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's. A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+ struct list_head node; /* service_queue->queued[] */
+ struct bio_list bios; /* queued bios */
+ struct throtl_grp *tg; /* tg this qnode belongs to */
};
-#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
- .count = 0, .min_disptime = 0}
+struct throtl_service_queue {
+ struct throtl_service_queue *parent_sq; /* the parent service_queue */
+
+ /*
+ * Bios queued directly to this service_queue or dispatched from
+ * children throtl_grp's.
+ */
+ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
+ unsigned int nr_queued[2]; /* number of queued bios */
+
+ /*
+ * RB tree of active children throtl_grp's, which are sorted by
+ * their ->disptime.
+ */
+ struct rb_root pending_tree; /* RB tree of active tgs */
+ struct rb_node *first_pending; /* first node in the tree */
+ unsigned int nr_pending; /* # queued in the tree */
+ unsigned long first_pending_disptime; /* disptime of the first tg */
+ struct timer_list pending_timer; /* fires on first_pending_disptime */
+};
+
+enum tg_state_flags {
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+};
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
@@ -52,9 +95,26 @@ struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
- /* active throtl group service_tree member */
+ /* active throtl group service_queue member */
struct rb_node rb_node;
+ /* throtl_data this group belongs to */
+ struct throtl_data *td;
+
+ /* this group's service queue */
+ struct throtl_service_queue service_queue;
+
+ /*
+ * qnode_on_self is used when bios are directly queued to this
+ * throtl_grp so that local bios compete fairly with bios
+ * dispatched from children. qnode_on_parent is used when bios are
+ * dispatched from this throtl_grp into its parent and will compete
+ * with the sibling qnode_on_parents and the parent's
+ * qnode_on_self.
+ */
+ struct throtl_qnode qnode_on_self[2];
+ struct throtl_qnode qnode_on_parent[2];
+
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
@@ -64,11 +124,8 @@ struct throtl_grp {
unsigned int flags;
- /* Two lists for READ and WRITE */
- struct bio_list bio_lists[2];
-
- /* Number of queued bios on READ and WRITE lists */
- unsigned int nr_queued[2];
+ /* are there any throtl rules between this group and td? */
+ bool has_rules[2];
/* bytes per second rate limits */
uint64_t bps[2];
@@ -85,9 +142,6 @@ struct throtl_grp {
unsigned long slice_start[2];
unsigned long slice_end[2];
- /* Some throttle limits got updated for the group */
- int limits_changed;
-
/* Per cpu stats pointer */
struct tg_stats_cpu __percpu *stats_cpu;
@@ -98,7 +152,7 @@ struct throtl_grp {
struct throtl_data
{
/* service tree for active throtl groups */
- struct throtl_rb_root tg_service_tree;
+ struct throtl_service_queue service_queue;
struct request_queue *queue;
@@ -111,9 +165,7 @@ struct throtl_data
unsigned int nr_undestroyed_grps;
/* Work for dispatching throttled bios */
- struct delayed_work throtl_work;
-
- int limits_changed;
+ struct work_struct dispatch_work;
};
/* list and work item to allocate percpu group stats */
@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list);
static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
+static void throtl_pending_timer_fn(unsigned long arg);
+
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
return blkg_to_tg(td->queue->root_blkg);
}
-enum tg_state_flags {
- THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
-};
-
-#define THROTL_TG_FNS(name) \
-static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
-{ \
- (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
-} \
-static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
-{ \
- (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
-} \
-static inline int throtl_tg_##name(const struct throtl_grp *tg) \
-{ \
- return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
+/**
+ * sq_to_tg - return the throl_grp the specified service queue belongs to
+ * @sq: the throtl_service_queue of interest
+ *
+ * Return the throtl_grp @sq belongs to. If @sq is the top-level one
+ * embedded in throtl_data, %NULL is returned.
+ */
+static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
+{
+ if (sq && sq->parent_sq)
+ return container_of(sq, struct throtl_grp, service_queue);
+ else
+ return NULL;
}
-THROTL_TG_FNS(on_rr);
+/**
+ * sq_to_td - return throtl_data the specified service queue belongs to
+ * @sq: the throtl_service_queue of interest
+ *
+ * A service_queue can be embeded in either a throtl_grp or throtl_data.
+ * Determine the associated throtl_data accordingly and return it.
+ */
+static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
+{
+ struct throtl_grp *tg = sq_to_tg(sq);
-#define throtl_log_tg(td, tg, fmt, args...) do { \
- char __pbuf[128]; \
+ if (tg)
+ return tg->td;
+ else
+ return container_of(sq, struct throtl_data, service_queue);
+}
+
+/**
+ * throtl_log - log debug message via blktrace
+ * @sq: the service_queue being reported
+ * @fmt: printf format string
+ * @args: printf args
+ *
+ * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
+ * throtl_grp; otherwise, just "throtl".
+ *
+ * TODO: this should be made a function and name formatting should happen
+ * after testing whether blktrace is enabled.
+ */
+#define throtl_log(sq, fmt, args...) do { \
+ struct throtl_grp *__tg = sq_to_tg((sq)); \
+ struct throtl_data *__td = sq_to_td((sq)); \
+ \
+ (void)__td; \
+ if ((__tg)) { \
+ char __pbuf[128]; \
\
- blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+ blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
+ } else { \
+ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
+ } \
} while (0)
-#define throtl_log(td, fmt, args...) \
- blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
-
-static inline unsigned int total_nr_queued(struct throtl_data *td)
-{
- return td->nr_queued[0] + td->nr_queued[1];
-}
-
/*
* Worker for allocating per cpu stat for tgs. This is scheduled on the
* system_wq once there are some groups on the alloc_list waiting for
@@ -215,15 +293,141 @@ alloc_stats:
goto alloc_stats;
}
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+{
+ INIT_LIST_HEAD(&qn->node);
+ bio_list_init(&qn->bios);
+ qn->tg = tg;
+}
+
+/**
+ * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
+ * @bio: bio being added
+ * @qn: qnode to add bio to
+ * @queued: the service_queue->queued[] list @qn belongs to
+ *
+ * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * @qn->tg's reference count is bumped when @qn is activated. See the
+ * comment on top of throtl_qnode definition for details.
+ */
+static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
+ struct list_head *queued)
+{
+ bio_list_add(&qn->bios, bio);
+ if (list_empty(&qn->node)) {
+ list_add_tail(&qn->node, queued);
+ blkg_get(tg_to_blkg(qn->tg));
+ }
+}
+
+/**
+ * throtl_peek_queued - peek the first bio on a qnode list
+ * @queued: the qnode list to peek
+ */
+static struct bio *throtl_peek_queued(struct list_head *queued)
+{
+ struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+ struct bio *bio;
+
+ if (list_empty(queued))
+ return NULL;
+
+ bio = bio_list_peek(&qn->bios);
+ WARN_ON_ONCE(!bio);
+ return bio;
+}
+
+/**
+ * throtl_pop_queued - pop the first bio form a qnode list
+ * @queued: the qnode list to pop a bio from
+ * @tg_to_put: optional out argument for throtl_grp to put
+ *
+ * Pop the first bio from the qnode list @queued. After popping, the first
+ * qnode is removed from @queued if empty or moved to the end of @queued so
+ * that the popping order is round-robin.
+ *
+ * When the first qnode is removed, its associated throtl_grp should be put
+ * too. If @tg_to_put is NULL, this function automatically puts it;
+ * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
+ * responsible for putting it.
+ */
+static struct bio *throtl_pop_queued(struct list_head *queued,
+ struct throtl_grp **tg_to_put)
+{
+ struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+ struct bio *bio;
+
+ if (list_empty(queued))
+ return NULL;
+
+ bio = bio_list_pop(&qn->bios);
+ WARN_ON_ONCE(!bio);
+
+ if (bio_list_empty(&qn->bios)) {
+ list_del_init(&qn->node);
+ if (tg_to_put)
+ *tg_to_put = qn->tg;
+ else
+ blkg_put(tg_to_blkg(qn->tg));
+ } else {
+ list_move_tail(&qn->node, queued);
+ }
+
+ return bio;
+}
+
+/* init a service_queue, assumes the caller zeroed it */
+static void throtl_service_queue_init(struct throtl_service_queue *sq,
+ struct throtl_service_queue *parent_sq)
+{
+ INIT_LIST_HEAD(&sq->queued[0]);
+ INIT_LIST_HEAD(&sq->queued[1]);
+ sq->pending_tree = RB_ROOT;
+ sq->parent_sq = parent_sq;
+ setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
+ (unsigned long)sq);
+}
+
+static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+{
+ del_timer_sync(&sq->pending_timer);
+}
+
static void throtl_pd_init(struct blkcg_gq *blkg)
{
struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_data *td = blkg->q->td;
+ struct throtl_service_queue *parent_sq;
unsigned long flags;
+ int rw;
+
+ /*
+ * If sane_hierarchy is enabled, we switch to properly hierarchical
+ * behavior where limits on a given throtl_grp are applied to the
+ * whole subtree rather than just the group itself. e.g. If 16M
+ * read_bps limit is set on the root group, the whole system can't
+ * exceed 16M for the device.
+ *
+ * If sane_hierarchy is not enabled, the broken flat hierarchy
+ * behavior is retained where all throtl_grps are treated as if
+ * they're all separate root groups right below throtl_data.
+ * Limits of a group don't interact with limits of other groups
+ * regardless of the position of the group in the hierarchy.
+ */
+ parent_sq = &td->service_queue;
+
+ if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
+ parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
+
+ throtl_service_queue_init(&tg->service_queue, parent_sq);
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+ throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ }
RB_CLEAR_NODE(&tg->rb_node);
- bio_list_init(&tg->bio_lists[0]);
- bio_list_init(&tg->bio_lists[1]);
- tg->limits_changed = false;
+ tg->td = td;
tg->bps[READ] = -1;
tg->bps[WRITE] = -1;
@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}
+/*
+ * Set has_rules[] if @tg or any of its parents have limits configured.
+ * This doesn't require walking up to the top of the hierarchy as the
+ * parent's has_rules[] is guaranteed to be correct.
+ */
+static void tg_update_has_rules(struct throtl_grp *tg)
+{
+ struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+ int rw;
+
+ for (rw = READ; rw <= WRITE; rw++)
+ tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
+ (tg->bps[rw] != -1 || tg->iops[rw] != -1);
+}
+
+static void throtl_pd_online(struct blkcg_gq *blkg)
+{
+ /*
+ * We don't want new groups to escape the limits of its ancestors.
+ * Update has_rules[] after a new group is brought online.
+ */
+ tg_update_has_rules(blkg_to_tg(blkg));
+}
+
static void throtl_pd_exit(struct blkcg_gq *blkg)
{
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
free_percpu(tg->stats_cpu);
+
+ throtl_service_queue_exit(&tg->service_queue);
}
static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
return tg;
}
-static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+static struct throtl_grp *
+throtl_rb_first(struct throtl_service_queue *parent_sq)
{
/* Service tree is empty */
- if (!root->count)
+ if (!parent_sq->nr_pending)
return NULL;
- if (!root->left)
- root->left = rb_first(&root->rb);
+ if (!parent_sq->first_pending)
+ parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
- if (root->left)
- return rb_entry_tg(root->left);
+ if (parent_sq->first_pending)
+ return rb_entry_tg(parent_sq->first_pending);
return NULL;
}
@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
RB_CLEAR_NODE(n);
}
-static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+static void throtl_rb_erase(struct rb_node *n,
+ struct throtl_service_queue *parent_sq)
{
- if (root->left == n)
- root->left = NULL;
- rb_erase_init(n, &root->rb);
- --root->count;
+ if (parent_sq->first_pending == n)
+ parent_sq->first_pending = NULL;
+ rb_erase_init(n, &parent_sq->pending_tree);
+ --parent_sq->nr_pending;
}
-static void update_min_dispatch_time(struct throtl_rb_root *st)
+static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
struct throtl_grp *tg;
- tg = throtl_rb_first(st);
+ tg = throtl_rb_first(parent_sq);
if (!tg)
return;
- st->min_disptime = tg->disptime;
+ parent_sq->first_pending_disptime = tg->disptime;
}
-static void
-tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+static void tg_service_queue_add(struct throtl_grp *tg)
{
- struct rb_node **node = &st->rb.rb_node;
+ struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
+ struct rb_node **node = &parent_sq->pending_tree.rb_node;
struct rb_node *parent = NULL;
struct throtl_grp *__tg;
unsigned long key = tg->disptime;
@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
}
if (left)
- st->left = &tg->rb_node;
+ parent_sq->first_pending = &tg->rb_node;
rb_link_node(&tg->rb_node, parent, node);
- rb_insert_color(&tg->rb_node, &st->rb);
+ rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
}
-static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_enqueue_tg(struct throtl_grp *tg)
{
- struct throtl_rb_root *st = &td->tg_service_tree;
+ tg_service_queue_add(tg);
+ tg->flags |= THROTL_TG_PENDING;
+ tg->service_queue.parent_sq->nr_pending++;
+}
- tg_service_tree_add(st, tg);
- throtl_mark_tg_on_rr(tg);
- st->count++;
+static void throtl_enqueue_tg(struct throtl_grp *tg)
+{
+ if (!(tg->flags & THROTL_TG_PENDING))
+ __throtl_enqueue_tg(tg);
}
-static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_dequeue_tg(struct throtl_grp *tg)
{
- if (!throtl_tg_on_rr(tg))
- __throtl_enqueue_tg(td, tg);
+ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
+ tg->flags &= ~THROTL_TG_PENDING;
}
-static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_dequeue_tg(struct throtl_grp *tg)
{
- throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
- throtl_clear_tg_on_rr(tg);
+ if (tg->flags & THROTL_TG_PENDING)
+ __throtl_dequeue_tg(tg);
}
-static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+/* Call with queue lock held */
+static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
+ unsigned long expires)
{
- if (throtl_tg_on_rr(tg))
- __throtl_dequeue_tg(td, tg);
+ mod_timer(&sq->pending_timer, expires);
+ throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
+ expires - jiffies, jiffies);
}
-static void throtl_schedule_next_dispatch(struct throtl_data *td)
+/**
+ * throtl_schedule_next_dispatch - schedule the next dispatch cycle
+ * @sq: the service_queue to schedule dispatch for
+ * @force: force scheduling
+ *
+ * Arm @sq->pending_timer so that the next dispatch cycle starts on the
+ * dispatch time of the first pending child. Returns %true if either timer
+ * is armed or there's no pending child left. %false if the current
+ * dispatch window is still open and the caller should continue
+ * dispatching.
+ *
+ * If @force is %true, the dispatch timer is always scheduled and this
+ * function is guaranteed to return %true. This is to be used when the
+ * caller can't dispatch itself and needs to invoke pending_timer
+ * unconditionally. Note that forced scheduling is likely to induce short
+ * delay before dispatch starts even if @sq->first_pending_disptime is not
+ * in the future and thus shouldn't be used in hot paths.
+ */
+static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
+ bool force)
{
- struct throtl_rb_root *st = &td->tg_service_tree;
+ /* any pending children left? */
+ if (!sq->nr_pending)
+ return true;
- /*
- * If there are more bios pending, schedule more work.
- */
- if (!total_nr_queued(td))
- return;
+ update_min_dispatch_time(sq);
- BUG_ON(!st->count);
+ /* is the next dispatch time in the future? */
+ if (force || time_after(sq->first_pending_disptime, jiffies)) {
+ throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
+ return true;
+ }
- update_min_dispatch_time(st);
+ /* tell the caller to continue dispatching */
+ return false;
+}
- if (time_before_eq(st->min_disptime, jiffies))
- throtl_schedule_delayed_work(td, 0);
- else
- throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
+static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
+ bool rw, unsigned long start)
+{
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+
+ /*
+ * Previous slice has expired. We must have trimmed it after last
+ * bio dispatch. That means since start of last slice, we never used
+ * that bandwidth. Do try to make use of that bandwidth while giving
+ * credit.
+ */
+ if (time_after_eq(start, tg->slice_start[rw]))
+ tg->slice_start[rw] = start;
+
+ tg->slice_end[rw] = jiffies + throtl_slice;
+ throtl_log(&tg->service_queue,
+ "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', tg->slice_start[rw],
+ tg->slice_end[rw], jiffies);
}
-static inline void
-throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + throtl_slice;
- throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', tg->slice_start[rw],
- tg->slice_end[rw], jiffies);
+ throtl_log(&tg->service_queue,
+ "[%c] new slice start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', tg->slice_start[rw],
+ tg->slice_end[rw], jiffies);
}
-static inline void throtl_set_slice_end(struct throtl_data *td,
- struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
+ unsigned long jiffy_end)
{
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}
-static inline void throtl_extend_slice(struct throtl_data *td,
- struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
+ unsigned long jiffy_end)
{
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
- throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', tg->slice_start[rw],
- tg->slice_end[rw], jiffies);
+ throtl_log(&tg->service_queue,
+ "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', tg->slice_start[rw],
+ tg->slice_end[rw], jiffies);
}
/* Determine if previously allocated or extended slice is complete or not */
-static bool
-throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
return 0;
@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
}
/* Trim the used slices and adjust slice start accordingly */
-static inline void
-throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
unsigned long nr_slices, time_elapsed, io_trim;
u64 bytes_trim, tmp;
@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
* renewed. Don't try to trim the slice if slice is used. A new
* slice will start when appropriate.
*/
- if (throtl_slice_used(td, tg, rw))
+ if (throtl_slice_used(tg, rw))
return;
/*
@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start.
*/
- throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw];
@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
tg->slice_start[rw] += nr_slices * throtl_slice;
- throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
- " start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
- tg->slice_start[rw], tg->slice_end[rw], jiffies);
+ throtl_log(&tg->service_queue,
+ "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
+ tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
-static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
+ unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0;
}
-static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio, unsigned long *wait)
+static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
+ unsigned long *wait)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp;
@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0;
}
-static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
- if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
- return 1;
- return 0;
-}
-
/*
* Returns whether one can dispatch a bio or not. Also returns approx number
* of jiffies to wait before this bio is with-in IO rate and can be dispatched
*/
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio, unsigned long *wait)
+static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
+ unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+ BUG_ON(tg->service_queue.nr_queued[rw] &&
+ bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
* existing slice to make sure it is at least throtl_slice interval
* long since now.
*/
- if (throtl_slice_used(td, tg, rw))
- throtl_start_new_slice(td, tg, rw);
+ if (throtl_slice_used(tg, rw))
+ throtl_start_new_slice(tg, rw);
else {
if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
- throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+ throtl_extend_slice(tg, rw, jiffies + throtl_slice);
}
- if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
- && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+ if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
+ tg_with_in_iops_limit(tg, bio, &iops_wait)) {
if (wait)
*wait = 0;
return 1;
@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
*wait = max_wait;
if (time_before(tg->slice_end[rw], jiffies + max_wait))
- throtl_extend_slice(td, tg, rw, jiffies + max_wait);
+ throtl_extend_slice(tg, rw, jiffies + max_wait);
return 0;
}
@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++;
- throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
+ /*
+ * REQ_THROTTLED is used to prevent the same bio to be throttled
+ * more than once as a throttled bio will go through blk-throtl the
+ * second time when it eventually gets issued. Set it when a bio
+ * is being charged to a tg.
+ *
+ * Dispatch stats aren't recursive and each @bio should only be
+ * accounted by the @tg it was originally associated with. Let's
+ * update the stats when setting REQ_THROTTLED for the first time
+ * which is guaranteed to be for the @bio's original tg.
+ */
+ if (!(bio->bi_rw & REQ_THROTTLED)) {
+ bio->bi_rw |= REQ_THROTTLED;
+ throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
+ bio->bi_rw);
+ }
}
-static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
- struct bio *bio)
+/**
+ * throtl_add_bio_tg - add a bio to the specified throtl_grp
+ * @bio: bio to add
+ * @qn: qnode to use
+ * @tg: the target throtl_grp
+ *
+ * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
+ * tg->qnode_on_self[] is used.
+ */
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
+ struct throtl_grp *tg)
{
+ struct throtl_service_queue *sq = &tg->service_queue;
bool rw = bio_data_dir(bio);
- bio_list_add(&tg->bio_lists[rw], bio);
- /* Take a bio reference on tg */
- blkg_get(tg_to_blkg(tg));
- tg->nr_queued[rw]++;
- td->nr_queued[rw]++;
- throtl_enqueue_tg(td, tg);
+ if (!qn)
+ qn = &tg->qnode_on_self[rw];
+
+ /*
+ * If @tg doesn't currently have any bios queued in the same
+ * direction, queueing @bio can change when @tg should be
+ * dispatched. Mark that @tg was empty. This is automatically
+ * cleaered on the next tg_update_disptime().
+ */
+ if (!sq->nr_queued[rw])
+ tg->flags |= THROTL_TG_WAS_EMPTY;
+
+ throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+
+ sq->nr_queued[rw]++;
+ throtl_enqueue_tg(tg);
}
-static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+static void tg_update_disptime(struct throtl_grp *tg)
{
+ struct throtl_service_queue *sq = &tg->service_queue;
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
struct bio *bio;
- if ((bio = bio_list_peek(&tg->bio_lists[READ])))
- tg_may_dispatch(td, tg, bio, &read_wait);
+ if ((bio = throtl_peek_queued(&sq->queued[READ])))
+ tg_may_dispatch(tg, bio, &read_wait);
- if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
- tg_may_dispatch(td, tg, bio, &write_wait);
+ if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
+ tg_may_dispatch(tg, bio, &write_wait);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
/* Update dispatch time */
- throtl_dequeue_tg(td, tg);
+ throtl_dequeue_tg(tg);
tg->disptime = disptime;
- throtl_enqueue_tg(td, tg);
+ throtl_enqueue_tg(tg);
+
+ /* see throtl_add_bio_tg() */
+ tg->flags &= ~THROTL_TG_WAS_EMPTY;
}
-static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
- bool rw, struct bio_list *bl)
+static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
+ struct throtl_grp *parent_tg, bool rw)
{
- struct bio *bio;
+ if (throtl_slice_used(parent_tg, rw)) {
+ throtl_start_new_slice_with_credit(parent_tg, rw,
+ child_tg->slice_start[rw]);
+ }
+
+}
- bio = bio_list_pop(&tg->bio_lists[rw]);
- tg->nr_queued[rw]--;
- /* Drop bio reference on blkg */
- blkg_put(tg_to_blkg(tg));
+static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ struct throtl_service_queue *parent_sq = sq->parent_sq;
+ struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
+ struct throtl_grp *tg_to_put = NULL;
+ struct bio *bio;
- BUG_ON(td->nr_queued[rw] <= 0);
- td->nr_queued[rw]--;
+ /*
+ * @bio is being transferred from @tg to @parent_sq. Popping a bio
+ * from @tg may put its reference and @parent_sq might end up
+ * getting released prematurely. Remember the tg to put and put it
+ * after @bio is transferred to @parent_sq.
+ */
+ bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
+ sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio);
- bio_list_add(bl, bio);
- bio->bi_rw |= REQ_THROTTLED;
- throtl_trim_slice(td, tg, rw);
+ /*
+ * If our parent is another tg, we just need to transfer @bio to
+ * the parent using throtl_add_bio_tg(). If our parent is
+ * @td->service_queue, @bio is ready to be issued. Put it on its
+ * bio_lists[] and decrease total number queued. The caller is
+ * responsible for issuing these bios.
+ */
+ if (parent_tg) {
+ throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
+ start_parent_slice_with_credit(tg, parent_tg, rw);
+ } else {
+ throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
+ &parent_sq->queued[rw]);
+ BUG_ON(tg->td->nr_queued[rw] <= 0);
+ tg->td->nr_queued[rw]--;
+ }
+
+ throtl_trim_slice(tg, rw);
+
+ if (tg_to_put)
+ blkg_put(tg_to_blkg(tg_to_put));
}
-static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
- struct bio_list *bl)
+static int throtl_dispatch_tg(struct throtl_grp *tg)
{
+ struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0;
unsigned int max_nr_reads = throtl_grp_quantum*3/4;
unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
/* Try to dispatch 75% READS and 25% WRITES */
- while ((bio = bio_list_peek(&tg->bio_lists[READ]))
- && tg_may_dispatch(td, tg, bio, NULL)) {
+ while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
+ tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
nr_reads++;
if (nr_reads >= max_nr_reads)
break;
}
- while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
- && tg_may_dispatch(td, tg, bio, NULL)) {
+ while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
+ tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
nr_writes++;
if (nr_writes >= max_nr_writes)
@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
return nr_reads + nr_writes;
}
-static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{
unsigned int nr_disp = 0;
- struct throtl_grp *tg;
- struct throtl_rb_root *st = &td->tg_service_tree;
while (1) {
- tg = throtl_rb_first(st);
+ struct throtl_grp *tg = throtl_rb_first(parent_sq);
+ struct throtl_service_queue *sq = &tg->service_queue;
if (!tg)
break;
@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
if (time_before(jiffies, tg->disptime))
break;
- throtl_dequeue_tg(td, tg);
+ throtl_dequeue_tg(tg);
- nr_disp += throtl_dispatch_tg(td, tg, bl);
+ nr_disp += throtl_dispatch_tg(tg);
- if (tg->nr_queued[0] || tg->nr_queued[1]) {
- tg_update_disptime(td, tg);
- throtl_enqueue_tg(td, tg);
- }
+ if (sq->nr_queued[0] || sq->nr_queued[1])
+ tg_update_disptime(tg);
if (nr_disp >= throtl_quantum)
break;
@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
return nr_disp;
}
-static void throtl_process_limit_change(struct throtl_data *td)
+/**
+ * throtl_pending_timer_fn - timer function for service_queue->pending_timer
+ * @arg: the throtl_service_queue being serviced
+ *
+ * This timer is armed when a child throtl_grp with active bio's become
+ * pending and queued on the service_queue's pending_tree and expires when
+ * the first child throtl_grp should be dispatched. This function
+ * dispatches bio's from the children throtl_grps to the parent
+ * service_queue.
+ *
+ * If the parent's parent is another throtl_grp, dispatching is propagated
+ * by either arming its pending_timer or repeating dispatch directly. If
+ * the top-level service_tree is reached, throtl_data->dispatch_work is
+ * kicked so that the ready bio's are issued.
+ */
+static void throtl_pending_timer_fn(unsigned long arg)
{
+ struct throtl_service_queue *sq = (void *)arg;
+ struct throtl_grp *tg = sq_to_tg(sq);
+ struct throtl_data *td = sq_to_td(sq);
struct request_queue *q = td->queue;
- struct blkcg_gq *blkg, *n;
-
- if (!td->limits_changed)
- return;
-
- xchg(&td->limits_changed, false);
-
- throtl_log(td, "limits changed");
-
- list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *parent_sq;
+ bool dispatched;
+ int ret;
- if (!tg->limits_changed)
- continue;
+ spin_lock_irq(q->queue_lock);
+again:
+ parent_sq = sq->parent_sq;
+ dispatched = false;
+
+ while (true) {
+ throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
+ sq->nr_queued[READ] + sq->nr_queued[WRITE],
+ sq->nr_queued[READ], sq->nr_queued[WRITE]);
+
+ ret = throtl_select_dispatch(sq);
+ if (ret) {
+ throtl_log(sq, "bios disp=%u", ret);
+ dispatched = true;
+ }
- if (!xchg(&tg->limits_changed, false))
- continue;
+ if (throtl_schedule_next_dispatch(sq, false))
+ break;
- throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
- " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
- tg->iops[READ], tg->iops[WRITE]);
+ /* this dispatch windows is still open, relax and repeat */
+ spin_unlock_irq(q->queue_lock);
+ cpu_relax();
+ spin_lock_irq(q->queue_lock);
+ }
- /*
- * Restart the slices for both READ and WRITES. It
- * might happen that a group's limit are dropped
- * suddenly and we don't want to account recently
- * dispatched IO with new low rate
- */
- throtl_start_new_slice(td, tg, 0);
- throtl_start_new_slice(td, tg, 1);
+ if (!dispatched)
+ goto out_unlock;
- if (throtl_tg_on_rr(tg))
- tg_update_disptime(td, tg);
+ if (parent_sq) {
+ /* @parent_sq is another throl_grp, propagate dispatch */
+ if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ tg_update_disptime(tg);
+ if (!throtl_schedule_next_dispatch(parent_sq, false)) {
+ /* window is already open, repeat dispatching */
+ sq = parent_sq;
+ tg = sq_to_tg(sq);
+ goto again;
+ }
+ }
+ } else {
+ /* reached the top-level, queue issueing */
+ queue_work(kthrotld_workqueue, &td->dispatch_work);
}
+out_unlock:
+ spin_unlock_irq(q->queue_lock);
}
-/* Dispatch throttled bios. Should be called without queue lock held. */
-static int throtl_dispatch(struct request_queue *q)
+/**
+ * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
+ * @work: work item being executed
+ *
+ * This function is queued for execution when bio's reach the bio_lists[]
+ * of throtl_data->service_queue. Those bio's are ready and issued by this
+ * function.
+ */
+void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
- struct throtl_data *td = q->td;
- unsigned int nr_disp = 0;
+ struct throtl_data *td = container_of(work, struct throtl_data,
+ dispatch_work);
+ struct throtl_service_queue *td_sq = &td->service_queue;
+ struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
-
- spin_lock_irq(q->queue_lock);
-
- throtl_process_limit_change(td);
-
- if (!total_nr_queued(td))
- goto out;
+ int rw;
bio_list_init(&bio_list_on_stack);
- throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
- total_nr_queued(td), td->nr_queued[READ],
- td->nr_queued[WRITE]);
-
- nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
-
- if (nr_disp)
- throtl_log(td, "bios disp=%u", nr_disp);
-
- throtl_schedule_next_dispatch(td);
-out:
+ spin_lock_irq(q->queue_lock);
+ for (rw = READ; rw <= WRITE; rw++)
+ while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(q->queue_lock);
- /*
- * If we dispatched some requests, unplug the queue to make sure
- * immediate dispatch
- */
- if (nr_disp) {
+ if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while((bio = bio_list_pop(&bio_list_on_stack)))
generic_make_request(bio);
blk_finish_plug(&plug);
}
- return nr_disp;
-}
-
-void blk_throtl_work(struct work_struct *work)
-{
- struct throtl_data *td = container_of(work, struct throtl_data,
- throtl_work.work);
- struct request_queue *q = td->queue;
-
- throtl_dispatch(q);
-}
-
-/* Call with queue lock held */
-static void
-throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
-{
-
- struct delayed_work *dwork = &td->throtl_work;
-
- /* schedule work if limits changed even if no bio is queued */
- if (total_nr_queued(td) || td->limits_changed) {
- mod_delayed_work(kthrotld_workqueue, dwork, delay);
- throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
- delay, jiffies);
- }
}
static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
- struct throtl_data *td;
+ struct throtl_service_queue *sq;
+ struct blkcg_gq *blkg;
+ struct cgroup *pos_cgrp;
int ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
return ret;
tg = blkg_to_tg(ctx.blkg);
- td = ctx.blkg->q->td;
+ sq = &tg->service_queue;
if (!ctx.v)
ctx.v = -1;
@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
else
*(unsigned int *)((void *)tg + cft->private) = ctx.v;
- /* XXX: we don't need the following deferred processing */
- xchg(&tg->limits_changed, true);
- xchg(&td->limits_changed, true);
- throtl_schedule_delayed_work(td, 0);
+ throtl_log(&tg->service_queue,
+ "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+ tg->bps[READ], tg->bps[WRITE],
+ tg->iops[READ], tg->iops[WRITE]);
+
+ /*
+ * Update has_rules[] flags for the updated tg's subtree. A tg is
+ * considered to have rules if either the tg itself or any of its
+ * ancestors has rules. This identifies groups without any
+ * restrictions in the whole hierarchy and allows them to bypass
+ * blk-throttle.
+ */
+ tg_update_has_rules(tg);
+ blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
+ tg_update_has_rules(blkg_to_tg(blkg));
+
+ /*
+ * We're already holding queue_lock and know @tg is valid. Let's
+ * apply the new config directly.
+ *
+ * Restart the slices for both READ and WRITES. It might happen
+ * that a group's limit are dropped suddenly and we don't want to
+ * account recently dispatched IO with new low rate.
+ */
+ throtl_start_new_slice(tg, 0);
+ throtl_start_new_slice(tg, 1);
+
+ if (tg->flags & THROTL_TG_PENDING) {
+ tg_update_disptime(tg);
+ throtl_schedule_next_dispatch(sq->parent_sq, true);
+ }
blkg_conf_finish(&ctx);
return 0;
@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
{
struct throtl_data *td = q->td;
- cancel_delayed_work_sync(&td->throtl_work);
+ cancel_work_sync(&td->dispatch_work);
}
static struct blkcg_policy blkcg_policy_throtl = {
@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
.cftypes = throtl_files,
.pd_init_fn = throtl_pd_init,
+ .pd_online_fn = throtl_pd_online,
.pd_exit_fn = throtl_pd_exit,
.pd_reset_stats_fn = throtl_pd_reset_stats,
};
@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = {
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
struct throtl_data *td = q->td;
+ struct throtl_qnode *qn = NULL;
struct throtl_grp *tg;
- bool rw = bio_data_dir(bio), update_disptime = true;
+ struct throtl_service_queue *sq;
+ bool rw = bio_data_dir(bio);
struct blkcg *blkcg;
bool throttled = false;
- if (bio->bi_rw & REQ_THROTTLED) {
- bio->bi_rw &= ~REQ_THROTTLED;
+ /* see throtl_charge_bio() */
+ if (bio->bi_rw & REQ_THROTTLED)
goto out;
- }
/*
* A throtl_grp pointer retrieved under rcu can be used to access
@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
blkcg = bio_blkcg(bio);
tg = throtl_lookup_tg(td, blkcg);
if (tg) {
- if (tg_no_rule_group(tg, rw)) {
+ if (!tg->has_rules[rw]) {
throtl_update_dispatch_stats(tg_to_blkg(tg),
bio->bi_size, bio->bi_rw);
goto out_unlock_rcu;
@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
if (unlikely(!tg))
goto out_unlock;
- if (tg->nr_queued[rw]) {
- /*
- * There is already another bio queued in same dir. No
- * need to update dispatch time.
- */
- update_disptime = false;
- goto queue_bio;
+ sq = &tg->service_queue;
- }
+ while (true) {
+ /* throtl is FIFO - if bios are already queued, should queue */
+ if (sq->nr_queued[rw])
+ break;
+
+ /* if above limits, break to queue */
+ if (!tg_may_dispatch(tg, bio, NULL))
+ break;
- /* Bio is with-in rate limit of group */
- if (tg_may_dispatch(td, tg, bio, NULL)) {
+ /* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
/*
@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
*
* So keep on trimming slice even if bio is not queued.
*/
- throtl_trim_slice(td, tg, rw);
- goto out_unlock;
+ throtl_trim_slice(tg, rw);
+
+ /*
+ * @bio passed through this layer without being throttled.
+ * Climb up the ladder. If we''re already at the top, it
+ * can be executed directly.
+ */
+ qn = &tg->qnode_on_parent[rw];
+ sq = sq->parent_sq;
+ tg = sq_to_tg(sq);
+ if (!tg)
+ goto out_unlock;
}
-queue_bio:
- throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
- " iodisp=%u iops=%u queued=%d/%d",
- rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
- tg->io_disp[rw], tg->iops[rw],
- tg->nr_queued[READ], tg->nr_queued[WRITE]);
+ /* out-of-limit, queue to @tg */
+ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+ rw == READ ? 'R' : 'W',
+ tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+ tg->io_disp[rw], tg->iops[rw],
+ sq->nr_queued[READ], sq->nr_queued[WRITE]);
bio_associate_current(bio);
- throtl_add_bio_tg(q->td, tg, bio);
+ tg->td->nr_queued[rw]++;
+ throtl_add_bio_tg(bio, qn, tg);
throttled = true;
- if (update_disptime) {
- tg_update_disptime(td, tg);
- throtl_schedule_next_dispatch(td);
+ /*
+ * Update @tg's dispatch time and force schedule dispatch if @tg
+ * was empty before @bio. The forced scheduling isn't likely to
+ * cause undue delay as @bio is likely to be dispatched directly if
+ * its @tg's disptime is not in the future.
+ */
+ if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ tg_update_disptime(tg);
+ throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}
out_unlock:
@@ -1193,9 +1580,38 @@ out_unlock:
out_unlock_rcu:
rcu_read_unlock();
out:
+ /*
+ * As multiple blk-throtls may stack in the same issue path, we
+ * don't want bios to leave with the flag set. Clear the flag if
+ * being issued.
+ */
+ if (!throttled)
+ bio->bi_rw &= ~REQ_THROTTLED;
return throttled;
}
+/*
+ * Dispatch all bios from all children tg's queued on @parent_sq. On
+ * return, @parent_sq is guaranteed to not have any active children tg's
+ * and all bios from previously active tg's are on @parent_sq->bio_lists[].
+ */
+static void tg_drain_bios(struct throtl_service_queue *parent_sq)
+{
+ struct throtl_grp *tg;
+
+ while ((tg = throtl_rb_first(parent_sq))) {
+ struct throtl_service_queue *sq = &tg->service_queue;
+ struct bio *bio;
+
+ throtl_dequeue_tg(tg);
+
+ while ((bio = throtl_peek_queued(&sq->queued[READ])))
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
+ tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ }
+}
+
/**
* blk_throtl_drain - drain throttled bios
* @q: request_queue to drain throttled bios for
@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
{
struct throtl_data *td = q->td;
- struct throtl_rb_root *st = &td->tg_service_tree;
- struct throtl_grp *tg;
- struct bio_list bl;
+ struct blkcg_gq *blkg;
+ struct cgroup *pos_cgrp;
struct bio *bio;
+ int rw;
queue_lockdep_assert_held(q);
+ rcu_read_lock();
+
+ /*
+ * Drain each tg while doing post-order walk on the blkg tree, so
+ * that all bios are propagated to td->service_queue. It'd be
+ * better to walk service_queue tree directly but blkg walk is
+ * easier.
+ */
+ blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
+ tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
- bio_list_init(&bl);
+ tg_drain_bios(&td_root_tg(td)->service_queue);
- while ((tg = throtl_rb_first(st))) {
- throtl_dequeue_tg(td, tg);
+ /* finally, transfer bios from top-level tg's into the td */
+ tg_drain_bios(&td->service_queue);
- while ((bio = bio_list_peek(&tg->bio_lists[READ])))
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
- while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
- tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
- }
+ rcu_read_unlock();
spin_unlock_irq(q->queue_lock);
- while ((bio = bio_list_pop(&bl)))
- generic_make_request(bio);
+ /* all bios now should be in td->service_queue, issue them */
+ for (rw = READ; rw <= WRITE; rw++)
+ while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+ NULL)))
+ generic_make_request(bio);
spin_lock_irq(q->queue_lock);
}
@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q)
if (!td)
return -ENOMEM;
- td->tg_service_tree = THROTL_RB_ROOT;
- td->limits_changed = false;
- INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+ INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+ throtl_service_queue_init(&td->service_queue, NULL);
q->td = td;
td->queue = q;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b81ddfe..e07a5fd 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -532,11 +532,11 @@ config BLK_DEV_RBD
If unsure, say N.
config BLK_DEV_RSXX
- tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver"
+ tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
depends on PCI
help
Device driver for IBM's high speed PCIe SSD
- storage devices: FlashSystem-70 and FlashSystem-80.
+ storage device: Flash Adapter 900GB Full Height.
To compile this driver as a module, choose M here: the
module will be called rsxx.
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 6608076..28c73ca 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev)
wake_up(&mdev->al_wait);
}
+int drbd_initialize_al(struct drbd_conf *mdev, void *buffer)
+{
+ struct al_transaction_on_disk *al = buffer;
+ struct drbd_md *md = &mdev->ldev->md;
+ sector_t al_base = md->md_offset + md->al_offset;
+ int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+ int i;
+
+ memset(al, 0, 4096);
+ al->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
+ al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+
+ for (i = 0; i < al_size_4k; i++) {
+ int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
static int w_update_odbm(struct drbd_work *w, int unused)
{
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f943aac..2d7f608 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -832,6 +832,7 @@ struct drbd_tconn { /* is a resource from the config file */
unsigned susp_nod:1; /* IO suspended because no data */
unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
struct mutex cstate_mutex; /* Protects graceful disconnects */
+ unsigned int connect_cnt; /* Inc each time a connection is established */
unsigned long flags;
struct net_conf *net_conf; /* content protected by rcu */
@@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
extern void conn_md_sync(struct drbd_tconn *tconn);
+extern void drbd_md_write(struct drbd_conf *mdev, void *buffer);
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
@@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev);
extern void drbd_resume_io(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size);
extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
-enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
-extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
+enum determine_dev_size {
+ DS_ERROR_SHRINK = -3,
+ DS_ERROR_SPACE_MD = -2,
+ DS_ERROR = -1,
+ DS_UNCHANGED = 0,
+ DS_SHRUNK = 1,
+ DS_GREW = 2
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
@@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
#define drbd_set_out_of_sync(mdev, sector, size) \
__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
extern void drbd_al_shrink(struct drbd_conf *mdev);
+extern int drbd_initialize_al(struct drbd_conf *, void *);
/* drbd_nl.c */
/* state info broadcast */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a5dca6a..55635ed 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2762,8 +2762,6 @@ int __init drbd_init(void)
/*
* allocate all necessary structs
*/
- err = -ENOMEM;
-
init_waitqueue_head(&drbd_pp_wait);
drbd_proc = NULL; /* play safe for drbd_cleanup */
@@ -2773,6 +2771,7 @@ int __init drbd_init(void)
if (err)
goto fail;
+ err = -ENOMEM;
drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
if (!drbd_proc) {
printk(KERN_ERR "drbd: unable to register proc file\n");
@@ -2803,7 +2802,6 @@ int __init drbd_init(void)
fail:
drbd_cleanup();
if (err == -ENOMEM)
- /* currently always the case */
printk(KERN_ERR "drbd: ran out of memory\n");
else
printk(KERN_ERR "drbd: initialization failure\n");
@@ -2881,34 +2879,14 @@ struct meta_data_on_disk {
u8 reserved_u8[4096 - (7*8 + 10*4)];
} __packed;
-/**
- * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
- * @mdev: DRBD device.
- */
-void drbd_md_sync(struct drbd_conf *mdev)
+
+
+void drbd_md_write(struct drbd_conf *mdev, void *b)
{
- struct meta_data_on_disk *buffer;
+ struct meta_data_on_disk *buffer = b;
sector_t sector;
int i;
- /* Don't accidentally change the DRBD meta data layout. */
- BUILD_BUG_ON(UI_SIZE != 4);
- BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
-
- del_timer(&mdev->md_sync_timer);
- /* timer may be rearmed by drbd_md_mark_dirty() now. */
- if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
- return;
-
- /* We use here D_FAILED and not D_ATTACHING because we try to write
- * metadata even if we detach due to a disk failure! */
- if (!get_ldev_if_state(mdev, D_FAILED))
- return;
-
- buffer = drbd_md_get_buffer(mdev);
- if (!buffer)
- goto out;
-
memset(buffer, 0, sizeof(*buffer));
buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev)
dev_err(DEV, "meta data update failed!\n");
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
}
+}
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @mdev: DRBD device.
+ */
+void drbd_md_sync(struct drbd_conf *mdev)
+{
+ struct meta_data_on_disk *buffer;
+
+ /* Don't accidentally change the DRBD meta data layout. */
+ BUILD_BUG_ON(UI_SIZE != 4);
+ BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+ del_timer(&mdev->md_sync_timer);
+ /* timer may be rearmed by drbd_md_mark_dirty() now. */
+ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+ return;
+
+ /* We use here D_FAILED and not D_ATTACHING because we try to write
+ * metadata even if we detach due to a disk failure! */
+ if (!get_ldev_if_state(mdev, D_FAILED))
+ return;
+
+ buffer = drbd_md_get_buffer(mdev);
+ if (!buffer)
+ goto out;
+
+ drbd_md_write(mdev, buffer);
/* Update mdev->ldev->md.la_size_sect,
* since we updated it on metadata. */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9e3f441..8cc1e64 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
bool conn_try_outdate_peer(struct drbd_tconn *tconn)
{
+ unsigned int connect_cnt;
union drbd_state mask = { };
union drbd_state val = { };
enum drbd_fencing_p fp;
@@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
return false;
}
+ spin_lock_irq(&tconn->req_lock);
+ connect_cnt = tconn->connect_cnt;
+ spin_unlock_irq(&tconn->req_lock);
+
fp = highest_fencing_policy(tconn);
switch (fp) {
case FP_NOT_AVAIL:
@@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
here, because we might were able to re-establish the connection in the
meantime. */
spin_lock_irq(&tconn->req_lock);
- if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags))
- _conn_request_state(tconn, mask, val, CS_VERBOSE);
+ if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) {
+ if (tconn->connect_cnt != connect_cnt)
+ /* In case the connection was established and droped
+ while the fence-peer handler was running, ignore it */
+ conn_info(tconn, "Ignoring fence-peer exit code\n");
+ else
+ _conn_request_state(tconn, mask, val, CS_VERBOSE);
+ }
spin_unlock_irq(&tconn->req_lock);
return conn_highest_pdsk(tconn) <= D_OUTDATED;
@@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev)
* Returns 0 on success, negative return values indicate errors.
* You should call drbd_md_sync() after calling this function.
*/
-enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
sector_t la_size_sect, u_size;
+ struct drbd_md *md = &mdev->ldev->md;
+ u32 prev_al_stripe_size_4k;
+ u32 prev_al_stripes;
sector_t size;
char ppb[10];
+ void *buffer;
int md_moved, la_size_changed;
- enum determine_dev_size rv = unchanged;
+ enum determine_dev_size rv = DS_UNCHANGED;
/* race:
* application request passes inc_ap_bio,
@@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
* still lock the act_log to not trigger ASSERTs there.
*/
drbd_suspend_io(mdev);
+ buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */
+ if (!buffer) {
+ drbd_resume_io(mdev);
+ return DS_ERROR;
+ }
/* no wait necessary anymore, actually we could assert that */
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
@@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
prev_size = mdev->ldev->md.md_size_sect;
la_size_sect = mdev->ldev->md.la_size_sect;
- /* TODO: should only be some assert here, not (re)init... */
+ if (rs) {
+ /* rs is non NULL if we should change the AL layout only */
+
+ prev_al_stripes = md->al_stripes;
+ prev_al_stripe_size_4k = md->al_stripe_size_4k;
+
+ md->al_stripes = rs->al_stripes;
+ md->al_stripe_size_4k = rs->al_stripe_size / 4;
+ md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+ }
+
drbd_md_set_sector_offsets(mdev, mdev->ldev);
rcu_read_lock();
@@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
rcu_read_unlock();
size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
+ if (size < la_size_sect) {
+ if (rs && u_size == 0) {
+ /* Remove "rs &&" later. This check should always be active, but
+ right now the receiver expects the permissive behavior */
+ dev_warn(DEV, "Implicit shrink not allowed. "
+ "Use --size=%llus for explicit shrink.\n",
+ (unsigned long long)size);
+ rv = DS_ERROR_SHRINK;
+ }
+ if (u_size > size)
+ rv = DS_ERROR_SPACE_MD;
+ if (rv != DS_UNCHANGED)
+ goto err_out;
+ }
+
if (drbd_get_capacity(mdev->this_bdev) != size ||
drbd_bm_capacity(mdev) != size) {
int err;
@@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
"Leaving size unchanged at size = %lu KB\n",
(unsigned long)size);
}
- rv = dev_size_error;
+ rv = DS_ERROR;
}
/* racy, see comments above. */
drbd_set_my_capacity(mdev, size);
@@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
(unsigned long long)size>>1);
}
- if (rv == dev_size_error)
- goto out;
+ if (rv <= DS_ERROR)
+ goto err_out;
la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
|| prev_size != mdev->ldev->md.md_size_sect;
- if (la_size_changed || md_moved) {
- int err;
+ if (la_size_changed || md_moved || rs) {
+ u32 prev_flags;
drbd_al_shrink(mdev); /* All extents inactive. */
+
+ prev_flags = md->flags;
+ md->flags &= ~MDF_PRIMARY_IND;
+ drbd_md_write(mdev, buffer);
+
dev_info(DEV, "Writing the whole bitmap, %s\n",
la_size_changed && md_moved ? "size changed and md moved" :
la_size_changed ? "size changed" : "md moved");
/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
- err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
- "size changed", BM_LOCKED_MASK);
- if (err) {
- rv = dev_size_error;
- goto out;
- }
- drbd_md_mark_dirty(mdev);
+ drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+ "size changed", BM_LOCKED_MASK);
+ drbd_initialize_al(mdev, buffer);
+
+ md->flags = prev_flags;
+ drbd_md_write(mdev, buffer);
+
+ if (rs)
+ dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+ md->al_stripes, md->al_stripe_size_4k * 4);
}
if (size > la_size_sect)
- rv = grew;
+ rv = DS_GREW;
if (size < la_size_sect)
- rv = shrunk;
-out:
+ rv = DS_SHRUNK;
+
+ if (0) {
+ err_out:
+ if (rs) {
+ md->al_stripes = prev_al_stripes;
+ md->al_stripe_size_4k = prev_al_stripe_size_4k;
+ md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
+
+ drbd_md_set_sector_offsets(mdev, mdev->ldev);
+ }
+ }
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
+ drbd_md_put_buffer(mdev);
drbd_resume_io(mdev);
return rv;
@@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
!drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
set_bit(USE_DEGR_WFC_T, &mdev->flags);
- dd = drbd_determine_dev_size(mdev, 0);
- if (dd == dev_size_error) {
+ dd = drbd_determine_dev_size(mdev, 0, NULL);
+ if (dd <= DS_ERROR) {
retcode = ERR_NOMEM_BITMAP;
goto force_diskless_dec;
- } else if (dd == grew)
+ } else if (dd == DS_GREW)
set_bit(RESYNC_AFTER_NEG, &mdev->flags);
if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
@@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
struct drbd_conf *mdev;
enum drbd_ret_code retcode;
enum determine_dev_size dd;
+ bool change_al_layout = false;
enum dds_flags ddsf;
sector_t u_size;
int err;
@@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
if (retcode != NO_ERROR)
goto fail;
+ mdev = adm_ctx.mdev;
+ if (!get_ldev(mdev)) {
+ retcode = ERR_NO_DISK;
+ goto fail;
+ }
+
memset(&rs, 0, sizeof(struct resize_parms));
+ rs.al_stripes = mdev->ldev->md.al_stripes;
+ rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4;
if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
err = resize_parms_from_attrs(&rs, info);
if (err) {
retcode = ERR_MANDATORY_TAG;
drbd_msg_put_info(from_attrs_err_to_txt(err));
- goto fail;
+ goto fail_ldev;
}
}
- mdev = adm_ctx.mdev;
if (mdev->state.conn > C_CONNECTED) {
retcode = ERR_RESIZE_RESYNC;
- goto fail;
+ goto fail_ldev;
}
if (mdev->state.role == R_SECONDARY &&
mdev->state.peer == R_SECONDARY) {
retcode = ERR_NO_PRIMARY;
- goto fail;
- }
-
- if (!get_ldev(mdev)) {
- retcode = ERR_NO_DISK;
- goto fail;
+ goto fail_ldev;
}
if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
@@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
}
}
+ if (mdev->ldev->md.al_stripes != rs.al_stripes ||
+ mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+ u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
+
+ if (al_size_k > (16 * 1024 * 1024)) {
+ retcode = ERR_MD_LAYOUT_TOO_BIG;
+ goto fail_ldev;
+ }
+
+ if (al_size_k < MD_32kB_SECT/2) {
+ retcode = ERR_MD_LAYOUT_TOO_SMALL;
+ goto fail_ldev;
+ }
+
+ if (mdev->state.conn != C_CONNECTED) {
+ retcode = ERR_MD_LAYOUT_CONNECTED;
+ goto fail_ldev;
+ }
+
+ change_al_layout = true;
+ }
+
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
@@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
}
ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
- dd = drbd_determine_dev_size(mdev, ddsf);
+ dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL);
drbd_md_sync(mdev);
put_ldev(mdev);
- if (dd == dev_size_error) {
+ if (dd == DS_ERROR) {
retcode = ERR_NOMEM_BITMAP;
goto fail;
+ } else if (dd == DS_ERROR_SPACE_MD) {
+ retcode = ERR_MD_LAYOUT_NO_FIT;
+ goto fail;
+ } else if (dd == DS_ERROR_SHRINK) {
+ retcode = ERR_IMPLICIT_SHRINK;
+ goto fail;
}
if (mdev->state.conn == C_CONNECTED) {
- if (dd == grew)
+ if (dd == DS_GREW)
set_bit(RESIZE_PENDING, &mdev->flags);
drbd_send_uuids(mdev);
@@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
const struct sib_info *sib)
{
struct state_info *si = NULL; /* for sizeof(si->member); */
- struct net_conf *nc;
struct nlattr *nla;
int got_ldev;
int err = 0;
@@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
goto nla_put_failure;
rcu_read_lock();
- if (got_ldev)
- if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
- goto nla_put_failure;
+ if (got_ldev) {
+ struct disk_conf *disk_conf;
- nc = rcu_dereference(mdev->tconn->net_conf);
- if (nc)
- err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ disk_conf = rcu_dereference(mdev->ldev->disk_conf);
+ err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+ }
+ if (!err) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ }
rcu_read_unlock();
if (err)
goto nla_put_failure;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4222aff..cc29cd3 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1039,6 +1039,8 @@ randomize:
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
kref_get(&mdev->kref);
+ rcu_read_unlock();
+
/* Prevent a race between resync-handshake and
* being promoted to Primary.
*
@@ -1049,8 +1051,6 @@ randomize:
mutex_lock(mdev->state_mutex);
mutex_unlock(mdev->state_mutex);
- rcu_read_unlock();
-
if (discard_my_data)
set_bit(DISCARD_MY_DATA, &mdev->flags);
else
@@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
{
struct drbd_conf *mdev;
struct p_sizes *p = pi->data;
- enum determine_dev_size dd = unchanged;
+ enum determine_dev_size dd = DS_UNCHANGED;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
@@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(mdev)) {
- dd = drbd_determine_dev_size(mdev, ddsf);
+ dd = drbd_determine_dev_size(mdev, ddsf, NULL);
put_ldev(mdev);
- if (dd == dev_size_error)
+ if (dd == DS_ERROR)
return -EIO;
drbd_md_sync(mdev);
} else {
@@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
drbd_send_sizes(mdev, 0, ddsf);
}
if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
- (dd == grew && mdev->state.conn == C_CONNECTED)) {
+ (dd == DS_GREW && mdev->state.conn == C_CONNECTED)) {
if (mdev->state.pdsk >= D_INCONSISTENT &&
mdev->state.disk >= D_INCONSISTENT) {
if (ddsf & DDSF_NO_RESYNC)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 90c5be2..216d47b 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
drbd_thread_restart_nowait(&mdev->tconn->receiver);
/* Resume AL writing if we get a connection */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
drbd_resume_al(mdev);
+ mdev->tconn->connect_cnt++;
+ }
/* remember last attach time so request_timer_fn() won't
* kill newly established sessions while we are still trying to thaw
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 5af21f2..6e85e21 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -31,6 +31,8 @@
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/genhd.h>
#include <linux/idr.h>
@@ -39,8 +41,9 @@
#include "rsxx_cfg.h"
#define NO_LEGACY 0
+#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */
-MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver");
+MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver");
MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRIVER_VERSION);
@@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY;
module_param(force_legacy, uint, 0444);
MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
+static unsigned int sync_start = 1;
+module_param(sync_start, uint, 0444);
+MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete "
+ "until the card startup has completed.");
+
static DEFINE_IDA(rsxx_disk_ida);
static DEFINE_SPINLOCK(rsxx_ida_lock);
+/* --------------------Debugfs Setup ------------------- */
+
+struct rsxx_cram {
+ u32 f_pos;
+ u32 offset;
+ void *i_private;
+};
+
+static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p)
+{
+ struct rsxx_cardinfo *card = m->private;
+
+ seq_printf(m, "HWID 0x%08x\n",
+ ioread32(card->regmap + HWID));
+ seq_printf(m, "SCRATCH 0x%08x\n",
+ ioread32(card->regmap + SCRATCH));
+ seq_printf(m, "IER 0x%08x\n",
+ ioread32(card->regmap + IER));
+ seq_printf(m, "IPR 0x%08x\n",
+ ioread32(card->regmap + IPR));
+ seq_printf(m, "CREG_CMD 0x%08x\n",
+ ioread32(card->regmap + CREG_CMD));
+ seq_printf(m, "CREG_ADD 0x%08x\n",
+ ioread32(card->regmap + CREG_ADD));
+ seq_printf(m, "CREG_CNT 0x%08x\n",
+ ioread32(card->regmap + CREG_CNT));
+ seq_printf(m, "CREG_STAT 0x%08x\n",
+ ioread32(card->regmap + CREG_STAT));
+ seq_printf(m, "CREG_DATA0 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA0));
+ seq_printf(m, "CREG_DATA1 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA1));
+ seq_printf(m, "CREG_DATA2 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA2));
+ seq_printf(m, "CREG_DATA3 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA3));
+ seq_printf(m, "CREG_DATA4 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA4));
+ seq_printf(m, "CREG_DATA5 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA5));
+ seq_printf(m, "CREG_DATA6 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA6));
+ seq_printf(m, "CREG_DATA7 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA7));
+ seq_printf(m, "INTR_COAL 0x%08x\n",
+ ioread32(card->regmap + INTR_COAL));
+ seq_printf(m, "HW_ERROR 0x%08x\n",
+ ioread32(card->regmap + HW_ERROR));
+ seq_printf(m, "DEBUG0 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG0));
+ seq_printf(m, "DEBUG1 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG1));
+ seq_printf(m, "DEBUG2 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG2));
+ seq_printf(m, "DEBUG3 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG3));
+ seq_printf(m, "DEBUG4 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG4));
+ seq_printf(m, "DEBUG5 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG5));
+ seq_printf(m, "DEBUG6 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG6));
+ seq_printf(m, "DEBUG7 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG7));
+ seq_printf(m, "RECONFIG 0x%08x\n",
+ ioread32(card->regmap + PCI_RECONFIG));
+
+ return 0;
+}
+
+static int rsxx_attr_stats_show(struct seq_file *m, void *p)
+{
+ struct rsxx_cardinfo *card = m->private;
+ int i;
+
+ for (i = 0; i < card->n_targets; i++) {
+ seq_printf(m, "Ctrl %d CRC Errors = %d\n",
+ i, card->ctrl[i].stats.crc_errors);
+ seq_printf(m, "Ctrl %d Hard Errors = %d\n",
+ i, card->ctrl[i].stats.hard_errors);
+ seq_printf(m, "Ctrl %d Soft Errors = %d\n",
+ i, card->ctrl[i].stats.soft_errors);
+ seq_printf(m, "Ctrl %d Writes Issued = %d\n",
+ i, card->ctrl[i].stats.writes_issued);
+ seq_printf(m, "Ctrl %d Writes Failed = %d\n",
+ i, card->ctrl[i].stats.writes_failed);
+ seq_printf(m, "Ctrl %d Reads Issued = %d\n",
+ i, card->ctrl[i].stats.reads_issued);
+ seq_printf(m, "Ctrl %d Reads Failed = %d\n",
+ i, card->ctrl[i].stats.reads_failed);
+ seq_printf(m, "Ctrl %d Reads Retried = %d\n",
+ i, card->ctrl[i].stats.reads_retried);
+ seq_printf(m, "Ctrl %d Discards Issued = %d\n",
+ i, card->ctrl[i].stats.discards_issued);
+ seq_printf(m, "Ctrl %d Discards Failed = %d\n",
+ i, card->ctrl[i].stats.discards_failed);
+ seq_printf(m, "Ctrl %d DMA SW Errors = %d\n",
+ i, card->ctrl[i].stats.dma_sw_err);
+ seq_printf(m, "Ctrl %d DMA HW Faults = %d\n",
+ i, card->ctrl[i].stats.dma_hw_fault);
+ seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n",
+ i, card->ctrl[i].stats.dma_cancelled);
+ seq_printf(m, "Ctrl %d SW Queue Depth = %d\n",
+ i, card->ctrl[i].stats.sw_q_depth);
+ seq_printf(m, "Ctrl %d HW Queue Depth = %d\n",
+ i, atomic_read(&card->ctrl[i].stats.hw_q_depth));
+ }
+
+ return 0;
+}
+
+static int rsxx_attr_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rsxx_attr_stats_show, inode->i_private);
+}
+
+static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rsxx_attr_pci_regs_show, inode->i_private);
+}
+
+static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct rsxx_cram *info = fp->private_data;
+ struct rsxx_cardinfo *card = info->i_private;
+ char *buf;
+ int st;
+
+ buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ info->f_pos = (u32)*ppos + info->offset;
+
+ st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
+ if (st)
+ return st;
+
+ st = copy_to_user(ubuf, buf, cnt);
+ if (st)
+ return st;
+
+ info->offset += cnt;
+
+ kfree(buf);
+
+ return cnt;
+}
+
+static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct rsxx_cram *info = fp->private_data;
+ struct rsxx_cardinfo *card = info->i_private;
+ char *buf;
+ int st;
+
+ buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ st = copy_from_user(buf, ubuf, cnt);
+ if (st)
+ return st;
+
+ info->f_pos = (u32)*ppos + info->offset;
+
+ st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
+ if (st)
+ return st;
+
+ info->offset += cnt;
+
+ kfree(buf);
+
+ return cnt;
+}
+
+static int rsxx_cram_open(struct inode *inode, struct file *file)
+{
+ struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->i_private = inode->i_private;
+ info->f_pos = file->f_pos;
+ file->private_data = info;
+
+ return 0;
+}
+
+static int rsxx_cram_release(struct inode *inode, struct file *file)
+{
+ struct rsxx_cram *info = file->private_data;
+
+ if (!info)
+ return 0;
+
+ kfree(info);
+ file->private_data = NULL;
+
+ return 0;
+}
+
+static const struct file_operations debugfs_cram_fops = {
+ .owner = THIS_MODULE,
+ .open = rsxx_cram_open,
+ .read = rsxx_cram_read,
+ .write = rsxx_cram_write,
+ .release = rsxx_cram_release,
+};
+
+static const struct file_operations debugfs_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = rsxx_attr_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations debugfs_pci_regs_fops = {
+ .owner = THIS_MODULE,
+ .open = rsxx_attr_pci_regs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
+{
+ struct dentry *debugfs_stats;
+ struct dentry *debugfs_pci_regs;
+ struct dentry *debugfs_cram;
+
+ card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL);
+ if (IS_ERR_OR_NULL(card->debugfs_dir))
+ goto failed_debugfs_dir;
+
+ debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+ card->debugfs_dir, card,
+ &debugfs_stats_fops);
+ if (IS_ERR_OR_NULL(debugfs_stats))
+ goto failed_debugfs_stats;
+
+ debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+ card->debugfs_dir, card,
+ &debugfs_pci_regs_fops);
+ if (IS_ERR_OR_NULL(debugfs_pci_regs))
+ goto failed_debugfs_pci_regs;
+
+ debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+ card->debugfs_dir, card,
+ &debugfs_cram_fops);
+ if (IS_ERR_OR_NULL(debugfs_cram))
+ goto failed_debugfs_cram;
+
+ return;
+failed_debugfs_cram:
+ debugfs_remove(debugfs_pci_regs);
+failed_debugfs_pci_regs:
+ debugfs_remove(debugfs_stats);
+failed_debugfs_stats:
+ debugfs_remove(card->debugfs_dir);
+failed_debugfs_dir:
+ card->debugfs_dir = NULL;
+}
+
/*----------------- Interrupt Control & Handling -------------------*/
static void rsxx_mask_interrupts(struct rsxx_cardinfo *card)
@@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata)
}
if (isr & CR_INTR_CREG) {
- schedule_work(&card->creg_ctrl.done_work);
+ queue_work(card->creg_ctrl.creg_wq,
+ &card->creg_ctrl.done_work);
handled++;
}
if (isr & CR_INTR_EVENT) {
- schedule_work(&card->event_work);
+ queue_work(card->event_wq, &card->event_work);
rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
handled++;
}
@@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev)
int i;
int st;
- dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n");
+ dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n");
card->eeh_state = 1;
rsxx_mask_interrupts(card);
@@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
{
struct rsxx_cardinfo *card = pci_get_drvdata(dev);
int i;
+ int cnt = 0;
- dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n");
+ dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n");
card->eeh_state = 1;
+ card->halt = 1;
- for (i = 0; i < card->n_targets; i++)
- del_timer_sync(&card->ctrl[i].activity_timer);
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock_bh(&card->ctrl[i].queue_lock);
+ cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
+ &card->ctrl[i].queue);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
+
+ cnt += rsxx_dma_cancel(&card->ctrl[i]);
- rsxx_eeh_cancel_dmas(card);
+ if (cnt)
+ dev_info(CARD_TO_DEV(card),
+ "Freed %d queued DMAs on channel %d\n",
+ cnt, card->ctrl[i].id);
+ }
}
static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card)
@@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
int st;
dev_warn(&dev->dev,
- "IBM FlashSystem PCI: recovering from slot reset.\n");
+ "IBM Flash Adapter PCI: recovering from slot reset.\n");
st = pci_enable_device(dev);
if (st)
@@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
&card->ctrl[i].issue_dma_work);
}
- dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n");
+ dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n");
return PCI_ERS_RESULT_RECOVERED;
@@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev,
{
struct rsxx_cardinfo *card;
int st;
+ unsigned int sync_timeout;
dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
@@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev,
}
/************* Setup Processor Command Interface *************/
- rsxx_creg_setup(card);
+ st = rsxx_creg_setup(card);
+ if (st) {
+ dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n");
+ goto failed_creg_setup;
+ }
spin_lock_irq(&card->irq_lock);
rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
@@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev,
}
/************* Setup Card Event Handler *************/
+ card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event");
+ if (!card->event_wq) {
+ dev_err(CARD_TO_DEV(card), "Failed card event setup.\n");
+ goto failed_event_handler;
+ }
+
INIT_WORK(&card->event_work, card_event_handler);
st = rsxx_setup_dev(card);
@@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev,
if (st)
dev_crit(CARD_TO_DEV(card),
"Failed issuing card startup\n");
+ if (sync_start) {
+ sync_timeout = SYNC_START_TIMEOUT;
+
+ dev_info(CARD_TO_DEV(card),
+ "Waiting for card to startup\n");
+
+ do {
+ ssleep(1);
+ sync_timeout--;
+
+ rsxx_get_card_state(card, &card->state);
+ } while (sync_timeout &&
+ (card->state == CARD_STATE_STARTING));
+
+ if (card->state == CARD_STATE_STARTING) {
+ dev_warn(CARD_TO_DEV(card),
+ "Card startup timed out\n");
+ card->size8 = 0;
+ } else {
+ dev_info(CARD_TO_DEV(card),
+ "card state: %s\n",
+ rsxx_card_state_to_str(card->state));
+ st = rsxx_get_card_size8(card, &card->size8);
+ if (st)
+ card->size8 = 0;
+ }
+ }
} else if (card->state == CARD_STATE_GOOD ||
card->state == CARD_STATE_RD_ONLY_FAULT) {
st = rsxx_get_card_size8(card, &card->size8);
@@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev,
rsxx_attach_dev(card);
+ /************* Setup Debugfs *************/
+ rsxx_debugfs_dev_new(card);
+
return 0;
failed_create_dev:
+ destroy_workqueue(card->event_wq);
+ card->event_wq = NULL;
+failed_event_handler:
rsxx_dma_destroy(card);
failed_dma_setup:
failed_compatiblity_check:
+ destroy_workqueue(card->creg_ctrl.creg_wq);
+ card->creg_ctrl.creg_wq = NULL;
+failed_creg_setup:
spin_lock_irq(&card->irq_lock);
rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
spin_unlock_irq(&card->irq_lock);
@@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev)
/* Prevent work_structs from re-queuing themselves. */
card->halt = 1;
+ debugfs_remove_recursive(card->debugfs_dir);
+
free_irq(dev->irq, card);
if (!force_legacy)
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c
index 4b5c020..926dce9 100644
--- a/drivers/block/rsxx/cregs.c
+++ b/drivers/block/rsxx/cregs.c
@@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card,
*hw_stat = completion.creg_status;
if (completion.st) {
+ /*
+ * This read is needed to verify that there has not been any
+ * extreme errors that might have occurred, i.e. EEH. The
+ * function iowrite32 will not detect EEH errors, so it is
+ * necessary that we recover if such an error is the reason
+ * for the timeout. This is a dummy read.
+ */
+ ioread32(card->regmap + SCRATCH);
+
dev_warn(CARD_TO_DEV(card),
"creg command failed(%d x%08x)\n",
completion.st, addr);
@@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card)
{
card->creg_ctrl.active_cmd = NULL;
+ card->creg_ctrl.creg_wq =
+ create_singlethread_workqueue(DRIVER_NAME"_creg");
+ if (!card->creg_ctrl.creg_wq)
+ return -ENOMEM;
+
INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
mutex_init(&card->creg_ctrl.reset_lock);
INIT_LIST_HEAD(&card->creg_ctrl.queue);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 4346d17..d7af441 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
atomic_set(&meta->error, 1);
if (atomic_dec_and_test(&meta->pending_dmas)) {
- disk_stats_complete(card, meta->bio, meta->start_time);
+ if (!card->eeh_state && card->gendisk)
+ disk_stats_complete(card, meta->bio, meta->start_time);
bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
kmem_cache_free(bio_meta_pool, meta);
@@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
might_sleep();
+ if (!card)
+ goto req_err;
+
+ if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk))
+ goto req_err;
+
if (unlikely(card->halt)) {
st = -EFAULT;
goto req_err;
@@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
atomic_set(&bio_meta->pending_dmas, 0);
bio_meta->start_time = jiffies;
- disk_stats_start(card, bio);
+ if (!unlikely(card->halt))
+ disk_stats_start(card, bio);
dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
bio_data_dir(bio) ? 'W' : 'R', bio_meta,
@@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
return (pci_rev >= RSXX_DISCARD_SUPPORT);
}
-static unsigned short rsxx_get_logical_block_size(
- struct rsxx_cardinfo *card)
-{
- u32 capabilities = 0;
- int st;
-
- st = rsxx_get_card_capabilities(card, &capabilities);
- if (st)
- dev_warn(CARD_TO_DEV(card),
- "Failed reading card capabilities register\n");
-
- /* Earlier firmware did not have support for 512 byte accesses */
- if (capabilities & CARD_CAP_SUBPAGE_WRITES)
- return 512;
- else
- return RSXX_HW_BLK_SIZE;
-}
-
int rsxx_attach_dev(struct rsxx_cardinfo *card)
{
mutex_lock(&card->dev_lock);
@@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
return -ENOMEM;
}
- blk_size = rsxx_get_logical_block_size(card);
+ blk_size = card->config.data.block_size;
blk_queue_make_request(card->queue, rsxx_make_request);
blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
@@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card)
card->gendisk = NULL;
blk_cleanup_queue(card->queue);
+ card->queue->queuedata = NULL;
unregister_blkdev(card->major, DRIVER_NAME);
}
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 0607513..bed32f1 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
kmem_cache_free(rsxx_dma_pool, dma);
}
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+ struct list_head *q)
+{
+ struct rsxx_dma *dma;
+ struct rsxx_dma *tmp;
+ int cnt = 0;
+
+ list_for_each_entry_safe(dma, tmp, q, list) {
+ list_del(&dma->list);
+ rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+ cnt++;
+ }
+
+ return cnt;
+}
+
static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
struct rsxx_dma *dma)
{
@@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
* Requeued DMAs go to the front of the queue so they are issued
* first.
*/
- spin_lock(&ctrl->queue_lock);
+ spin_lock_bh(&ctrl->queue_lock);
+ ctrl->stats.sw_q_depth++;
list_add(&dma->list, &ctrl->queue);
- spin_unlock(&ctrl->queue_lock);
+ spin_unlock_bh(&ctrl->queue_lock);
}
static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
@@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
static void dma_engine_stalled(unsigned long data)
{
struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+ int cnt;
if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
unlikely(ctrl->card->eeh_state))
@@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data)
"DMA channel %d has stalled, faulting interface.\n",
ctrl->id);
ctrl->card->dma_fault = 1;
+
+ /* Clean up the DMA queue */
+ spin_lock(&ctrl->queue_lock);
+ cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+ spin_unlock(&ctrl->queue_lock);
+
+ cnt += rsxx_dma_cancel(ctrl);
+
+ if (cnt)
+ dev_info(CARD_TO_DEV(ctrl->card),
+ "Freed %d queued DMAs on channel %d\n",
+ cnt, ctrl->id);
}
}
-static void rsxx_issue_dmas(struct work_struct *work)
+static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
{
- struct rsxx_dma_ctrl *ctrl;
struct rsxx_dma *dma;
int tag;
int cmds_pending = 0;
struct hw_cmd *hw_cmd_buf;
- ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
hw_cmd_buf = ctrl->cmd.buf;
if (unlikely(ctrl->card->halt) ||
@@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work)
return;
while (1) {
- spin_lock(&ctrl->queue_lock);
+ spin_lock_bh(&ctrl->queue_lock);
if (list_empty(&ctrl->queue)) {
- spin_unlock(&ctrl->queue_lock);
+ spin_unlock_bh(&ctrl->queue_lock);
break;
}
- spin_unlock(&ctrl->queue_lock);
+ spin_unlock_bh(&ctrl->queue_lock);
tag = pop_tracker(ctrl->trackers);
if (tag == -1)
break;
- spin_lock(&ctrl->queue_lock);
+ spin_lock_bh(&ctrl->queue_lock);
dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
list_del(&dma->list);
ctrl->stats.sw_q_depth--;
- spin_unlock(&ctrl->queue_lock);
+ spin_unlock_bh(&ctrl->queue_lock);
/*
* This will catch any DMAs that slipped in right before the
@@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work)
}
}
-static void rsxx_dma_done(struct work_struct *work)
+static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl)
{
- struct rsxx_dma_ctrl *ctrl;
struct rsxx_dma *dma;
unsigned long flags;
u16 count;
@@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work)
u8 tag;
struct hw_status *hw_st_buf;
- ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
hw_st_buf = ctrl->status.buf;
if (unlikely(ctrl->card->halt) ||
@@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work)
rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
- spin_lock(&ctrl->queue_lock);
+ spin_lock_bh(&ctrl->queue_lock);
if (ctrl->stats.sw_q_depth)
queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
- spin_unlock(&ctrl->queue_lock);
+ spin_unlock_bh(&ctrl->queue_lock);
}
-static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card,
- struct list_head *q)
+static void rsxx_schedule_issue(struct work_struct *work)
{
- struct rsxx_dma *dma;
- struct rsxx_dma *tmp;
- int cnt = 0;
+ struct rsxx_dma_ctrl *ctrl;
- list_for_each_entry_safe(dma, tmp, q, list) {
- list_del(&dma->list);
+ ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
- if (dma->dma_addr)
- pci_unmap_page(card->dev, dma->dma_addr,
- get_dma_size(dma),
- (dma->cmd == HW_CMD_BLK_WRITE) ?
- PCI_DMA_TODEVICE :
- PCI_DMA_FROMDEVICE);
- kmem_cache_free(rsxx_dma_pool, dma);
- cnt++;
- }
+ mutex_lock(&ctrl->work_lock);
+ rsxx_issue_dmas(ctrl);
+ mutex_unlock(&ctrl->work_lock);
+}
- return cnt;
+static void rsxx_schedule_done(struct work_struct *work)
+{
+ struct rsxx_dma_ctrl *ctrl;
+
+ ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
+
+ mutex_lock(&ctrl->work_lock);
+ rsxx_dma_done(ctrl);
+ mutex_unlock(&ctrl->work_lock);
}
static int rsxx_queue_discard(struct rsxx_cardinfo *card,
@@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
for (i = 0; i < card->n_targets; i++) {
if (!list_empty(&dma_list[i])) {
- spin_lock(&card->ctrl[i].queue_lock);
+ spin_lock_bh(&card->ctrl[i].queue_lock);
card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
- spin_unlock(&card->ctrl[i].queue_lock);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
queue_work(card->ctrl[i].issue_wq,
&card->ctrl[i].issue_dma_work);
@@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
return 0;
bvec_err:
- for (i = 0; i < card->n_targets; i++)
- rsxx_cleanup_dma_queue(card, &dma_list[i]);
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock_bh(&card->ctrl[i].queue_lock);
+ rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
+ }
return st;
}
@@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
spin_lock_init(&ctrl->trackers->lock);
spin_lock_init(&ctrl->queue_lock);
+ mutex_init(&ctrl->work_lock);
INIT_LIST_HEAD(&ctrl->queue);
setup_timer(&ctrl->activity_timer, dma_engine_stalled,
@@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
if (!ctrl->done_wq)
return -ENOMEM;
- INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas);
- INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done);
+ INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue);
+ INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done);
st = rsxx_hw_buffers_init(dev, ctrl);
if (st)
@@ -918,13 +947,30 @@ failed_dma_setup:
return st;
}
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl)
+{
+ struct rsxx_dma *dma;
+ int i;
+ int cnt = 0;
+
+ /* Clean up issued DMAs */
+ for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
+ dma = get_tracker_dma(ctrl->trackers, i);
+ if (dma) {
+ atomic_dec(&ctrl->stats.hw_q_depth);
+ rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+ push_tracker(ctrl->trackers, i);
+ cnt++;
+ }
+ }
+
+ return cnt;
+}
void rsxx_dma_destroy(struct rsxx_cardinfo *card)
{
struct rsxx_dma_ctrl *ctrl;
- struct rsxx_dma *dma;
- int i, j;
- int cnt = 0;
+ int i;
for (i = 0; i < card->n_targets; i++) {
ctrl = &card->ctrl[i];
@@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
del_timer_sync(&ctrl->activity_timer);
/* Clean up the DMA queue */
- spin_lock(&ctrl->queue_lock);
- cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue);
- spin_unlock(&ctrl->queue_lock);
-
- if (cnt)
- dev_info(CARD_TO_DEV(card),
- "Freed %d queued DMAs on channel %d\n",
- cnt, i);
-
- /* Clean up issued DMAs */
- for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
- dma = get_tracker_dma(ctrl->trackers, j);
- if (dma) {
- pci_unmap_page(card->dev, dma->dma_addr,
- get_dma_size(dma),
- (dma->cmd == HW_CMD_BLK_WRITE) ?
- PCI_DMA_TODEVICE :
- PCI_DMA_FROMDEVICE);
- kmem_cache_free(rsxx_dma_pool, dma);
- cnt++;
- }
- }
+ spin_lock_bh(&ctrl->queue_lock);
+ rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+ spin_unlock_bh(&ctrl->queue_lock);
- if (cnt)
- dev_info(CARD_TO_DEV(card),
- "Freed %d pending DMAs on channel %d\n",
- cnt, i);
+ rsxx_dma_cancel(ctrl);
vfree(ctrl->trackers);
@@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
cnt++;
}
- spin_lock(&card->ctrl[i].queue_lock);
+ spin_lock_bh(&card->ctrl[i].queue_lock);
list_splice(&issued_dmas[i], &card->ctrl[i].queue);
atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
@@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
PCI_DMA_TODEVICE :
PCI_DMA_FROMDEVICE);
}
- spin_unlock(&card->ctrl[i].queue_lock);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
}
kfree(issued_dmas);
@@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
return 0;
}
-void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card)
-{
- struct rsxx_dma *dma;
- struct rsxx_dma *tmp;
- int i;
-
- for (i = 0; i < card->n_targets; i++) {
- spin_lock(&card->ctrl[i].queue_lock);
- list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) {
- list_del(&dma->list);
-
- rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED);
- }
- spin_unlock(&card->ctrl[i].queue_lock);
- }
-}
-
int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
{
struct rsxx_dma *dma;
int i;
for (i = 0; i < card->n_targets; i++) {
- spin_lock(&card->ctrl[i].queue_lock);
+ spin_lock_bh(&card->ctrl[i].queue_lock);
list_for_each_entry(dma, &card->ctrl[i].queue, list) {
dma->dma_addr = pci_map_page(card->dev, dma->page,
dma->pg_off, get_dma_size(dma),
@@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
PCI_DMA_TODEVICE :
PCI_DMA_FROMDEVICE);
if (!dma->dma_addr) {
- spin_unlock(&card->ctrl[i].queue_lock);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
kmem_cache_free(rsxx_dma_pool, dma);
return -ENOMEM;
}
}
- spin_unlock(&card->ctrl[i].queue_lock);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
}
return 0;
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 382e8bf..5ad5055 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -39,6 +39,7 @@
#include <linux/vmalloc.h>
#include <linux/timer.h>
#include <linux/ioctl.h>
+#include <linux/delay.h>
#include "rsxx.h"
#include "rsxx_cfg.h"
@@ -114,6 +115,7 @@ struct rsxx_dma_ctrl {
struct timer_list activity_timer;
struct dma_tracker_list *trackers;
struct rsxx_dma_stats stats;
+ struct mutex work_lock;
};
struct rsxx_cardinfo {
@@ -134,6 +136,7 @@ struct rsxx_cardinfo {
spinlock_t lock;
bool active;
struct creg_cmd *active_cmd;
+ struct workqueue_struct *creg_wq;
struct work_struct done_work;
struct list_head queue;
unsigned int q_depth;
@@ -154,6 +157,7 @@ struct rsxx_cardinfo {
int buf_len;
} log;
+ struct workqueue_struct *event_wq;
struct work_struct event_work;
unsigned int state;
u64 size8;
@@ -181,6 +185,8 @@ struct rsxx_cardinfo {
int n_targets;
struct rsxx_dma_ctrl *ctrl;
+
+ struct dentry *debugfs_dir;
};
enum rsxx_pci_regmap {
@@ -283,6 +289,7 @@ enum rsxx_creg_addr {
CREG_ADD_CAPABILITIES = 0x80001050,
CREG_ADD_LOG = 0x80002000,
CREG_ADD_NUM_TARGETS = 0x80003000,
+ CREG_ADD_CRAM = 0xA0000000,
CREG_ADD_CONFIG = 0xB0000000,
};
@@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
int rsxx_dma_setup(struct rsxx_cardinfo *card);
void rsxx_dma_destroy(struct rsxx_cardinfo *card);
int rsxx_dma_init(void);
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q);
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
void rsxx_dma_cleanup(void);
void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
int rsxx_dma_configure(struct rsxx_cardinfo *card);
@@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
void *cb_data);
int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl);
int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card);
-void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card);
int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card);
/***** cregs.c *****/
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fe..bf4b9d2 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -50,110 +50,118 @@
#include "common.h"
/*
- * These are rather arbitrary. They are fairly large because adjacent requests
- * pulled from a communication ring are quite likely to end up being part of
- * the same scatter/gather request at the disc.
+ * Maximum number of unused free pages to keep in the internal buffer.
+ * Setting this to a value too low will reduce memory used in each backend,
+ * but can have a performance penalty.
*
- * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
- *
- * This will increase the chances of being able to write whole tracks.
- * 64 should be enough to keep us competitive with Linux.
+ * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
+ * be set to a lower value that might degrade performance on some intensive
+ * IO workloads.
*/
-static int xen_blkif_reqs = 64;
-module_param_named(reqs, xen_blkif_reqs, int, 0);
-MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
-/* Run-time switchable: /sys/module/blkback/parameters/ */
-static unsigned int log_stats;
-module_param(log_stats, int, 0644);
+static int xen_blkif_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in each block backend buffer");
/*
- * Each outstanding request that we've passed to the lower device layers has a
- * 'pending_req' allocated to it. Each buffer_head that completes decrements
- * the pendcnt towards zero. When it hits zero, the specified domain has a
- * response queued for it, with the saved 'id' passed back.
+ * Maximum number of grants to map persistently in blkback. For maximum
+ * performance this should be the total numbers of grants that can be used
+ * to fill the ring, but since this might become too high, specially with
+ * the use of indirect descriptors, we set it to a value that provides good
+ * performance without using too much memory.
+ *
+ * When the list of persistent grants is full we clean it up using a LRU
+ * algorithm.
*/
-struct pending_req {
- struct xen_blkif *blkif;
- u64 id;
- int nr_pages;
- atomic_t pendcnt;
- unsigned short operation;
- int status;
- struct list_head free_list;
- DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-};
-#define BLKBACK_INVALID_HANDLE (~0)
+static int xen_blkif_max_pgrants = 1056;
+module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
+MODULE_PARM_DESC(max_persistent_grants,
+ "Maximum number of grants to map persistently");
-struct xen_blkbk {
- struct pending_req *pending_reqs;
- /* List of all 'pending_req' available */
- struct list_head pending_free;
- /* And its spinlock. */
- spinlock_t pending_free_lock;
- wait_queue_head_t pending_free_wq;
- /* The list of all pages that are available. */
- struct page **pending_pages;
- /* And the grant handles that are available. */
- grant_handle_t *pending_grant_handles;
-};
-
-static struct xen_blkbk *blkbk;
+/*
+ * The LRU mechanism to clean the lists of persistent grants needs to
+ * be executed periodically. The time interval between consecutive executions
+ * of the purge mechanism is set in ms.
+ */
+#define LRU_INTERVAL 100
/*
- * Maximum number of grant pages that can be mapped in blkback.
- * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
- * pages that blkback will persistently map.
- * Currently, this is:
- * RING_SIZE = 32 (for all known ring types)
- * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
- * sizeof(struct persistent_gnt) = 48
- * So the maximum memory used to store the grants is:
- * 32 * 11 * 48 = 16896 bytes
+ * When the persistent grants list is full we will remove unused grants
+ * from the list. The percent number of grants to be removed at each LRU
+ * execution.
*/
-static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
+#define LRU_PERCENT_CLEAN 5
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats;
+module_param(log_stats, int, 0644);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+/* Number of free pages to remove on each call to free_xenballooned_pages */
+#define NUM_BATCH_FREE_PAGES 10
+
+static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
{
- switch (protocol) {
- case BLKIF_PROTOCOL_NATIVE:
- return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST;
- case BLKIF_PROTOCOL_X86_32:
- return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST;
- case BLKIF_PROTOCOL_X86_64:
- return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST;
- default:
- BUG();
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ if (list_empty(&blkif->free_pages)) {
+ BUG_ON(blkif->free_pages_num != 0);
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ return alloc_xenballooned_pages(1, page, false);
}
+ BUG_ON(blkif->free_pages_num == 0);
+ page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+ list_del(&page[0]->lru);
+ blkif->free_pages_num--;
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+
return 0;
}
-
-/*
- * Little helpful macro to figure out the index and virtual address of the
- * pending_pages[..]. For each 'pending_req' we have have up to
- * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
- * 10 and would index in the pending_pages[..].
- */
-static inline int vaddr_pagenr(struct pending_req *req, int seg)
+static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+ int num)
{
- return (req - blkbk->pending_reqs) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-}
+ unsigned long flags;
+ int i;
-#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ for (i = 0; i < num; i++)
+ list_add(&page[i]->lru, &blkif->free_pages);
+ blkif->free_pages_num += num;
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+}
-static inline unsigned long vaddr(struct pending_req *req, int seg)
+static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
{
- unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
- return (unsigned long)pfn_to_kaddr(pfn);
-}
+ /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
+ struct page *page[NUM_BATCH_FREE_PAGES];
+ unsigned int num_pages = 0;
+ unsigned long flags;
-#define pending_handle(_req, _seg) \
- (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ while (blkif->free_pages_num > num) {
+ BUG_ON(list_empty(&blkif->free_pages));
+ page[num_pages] = list_first_entry(&blkif->free_pages,
+ struct page, lru);
+ list_del(&page[num_pages]->lru);
+ blkif->free_pages_num--;
+ if (++num_pages == NUM_BATCH_FREE_PAGES) {
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ free_xenballooned_pages(num_pages, page);
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ num_pages = 0;
+ }
+ }
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ if (num_pages != 0)
+ free_xenballooned_pages(num_pages, page);
+}
+#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id,
(n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
-static void add_persistent_gnt(struct rb_root *root,
+/*
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single-thread for each backed, so we
+ * can be sure that this functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_grant, that can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
+ */
+static int add_persistent_gnt(struct xen_blkif *blkif,
struct persistent_gnt *persistent_gnt)
{
- struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct rb_node **new = NULL, *parent = NULL;
struct persistent_gnt *this;
+ if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+ if (!blkif->vbd.overflow_max_grants)
+ blkif->vbd.overflow_max_grants = 1;
+ return -EBUSY;
+ }
/* Figure out where to put new node */
+ new = &blkif->persistent_gnts.rb_node;
while (*new) {
this = container_of(*new, struct persistent_gnt, node);
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root,
else if (persistent_gnt->gnt > this->gnt)
new = &((*new)->rb_right);
else {
- pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
- BUG();
+ pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
+ return -EINVAL;
}
}
+ bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
+ set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
/* Add new node and rebalance tree. */
rb_link_node(&(persistent_gnt->node), parent, new);
- rb_insert_color(&(persistent_gnt->node), root);
+ rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
+ blkif->persistent_gnt_c++;
+ atomic_inc(&blkif->persistent_gnt_in_use);
+ return 0;
}
-static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
grant_ref_t gref)
{
struct persistent_gnt *data;
- struct rb_node *node = root->rb_node;
+ struct rb_node *node = NULL;
+ node = blkif->persistent_gnts.rb_node;
while (node) {
data = container_of(node, struct persistent_gnt, node);
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
node = node->rb_left;
else if (gref > data->gnt)
node = node->rb_right;
- else
+ else {
+ if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
+ pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
+ return NULL;
+ }
+ set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
+ atomic_inc(&blkif->persistent_gnt_in_use);
return data;
+ }
}
return NULL;
}
-static void free_persistent_gnts(struct rb_root *root, unsigned int num)
+static void put_persistent_gnt(struct xen_blkif *blkif,
+ struct persistent_gnt *persistent_gnt)
+{
+ if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+ pr_alert_ratelimited(DRV_PFX " freeing a grant already unused");
+ set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+ clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+ atomic_dec(&blkif->persistent_gnt_in_use);
+}
+
+static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+ unsigned int num)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
ret = gnttab_unmap_refs(unmap, NULL, pages,
segs_to_unmap);
BUG_ON(ret);
- free_xenballooned_pages(segs_to_unmap, pages);
+ put_free_pages(blkif, pages, segs_to_unmap);
segs_to_unmap = 0;
}
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
BUG_ON(num != 0);
}
+static void unmap_purged_grants(struct work_struct *work)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct persistent_gnt *persistent_gnt;
+ int ret, segs_to_unmap = 0;
+ struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+
+ while(!list_empty(&blkif->persistent_purge_list)) {
+ persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+ struct persistent_gnt,
+ remove_node);
+ list_del(&persistent_gnt->remove_node);
+
+ gnttab_set_unmap_op(&unmap[segs_to_unmap],
+ vaddr(persistent_gnt->page),
+ GNTMAP_host_map,
+ persistent_gnt->handle);
+
+ pages[segs_to_unmap] = persistent_gnt->page;
+
+ if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ ret = gnttab_unmap_refs(unmap, NULL, pages,
+ segs_to_unmap);
+ BUG_ON(ret);
+ put_free_pages(blkif, pages, segs_to_unmap);
+ segs_to_unmap = 0;
+ }
+ kfree(persistent_gnt);
+ }
+ if (segs_to_unmap > 0) {
+ ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
+ BUG_ON(ret);
+ put_free_pages(blkif, pages, segs_to_unmap);
+ }
+}
+
+static void purge_persistent_gnt(struct xen_blkif *blkif)
+{
+ struct persistent_gnt *persistent_gnt;
+ struct rb_node *n;
+ unsigned int num_clean, total;
+ bool scan_used = false, clean_used = false;
+ struct rb_root *root;
+
+ if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
+ (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
+ !blkif->vbd.overflow_max_grants)) {
+ return;
+ }
+
+ if (work_pending(&blkif->persistent_purge_work)) {
+ pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
+ return;
+ }
+
+ num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
+ num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+ num_clean = min(blkif->persistent_gnt_c, num_clean);
+ if ((num_clean == 0) ||
+ (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
+ return;
+
+ /*
+ * At this point, we can assure that there will be no calls
+ * to get_persistent_grant (because we are executing this code from
+ * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
+ * which means that the number of currently used grants will go down,
+ * but never up, so we will always be able to remove the requested
+ * number of grants.
+ */
+
+ total = num_clean;
+
+ pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
+
+ INIT_LIST_HEAD(&blkif->persistent_purge_list);
+ root = &blkif->persistent_gnts;
+purge_list:
+ foreach_grant_safe(persistent_gnt, n, root, node) {
+ BUG_ON(persistent_gnt->handle ==
+ BLKBACK_INVALID_HANDLE);
+
+ if (clean_used) {
+ clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+ continue;
+ }
+
+ if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+ continue;
+ if (!scan_used &&
+ (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
+ continue;
+
+ rb_erase(&persistent_gnt->node, root);
+ list_add(&persistent_gnt->remove_node,
+ &blkif->persistent_purge_list);
+ if (--num_clean == 0)
+ goto finished;
+ }
+ /*
+ * If we get here it means we also need to start cleaning
+ * grants that were used since last purge in order to cope
+ * with the requested num
+ */
+ if (!scan_used && !clean_used) {
+ pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
+ scan_used = true;
+ goto purge_list;
+ }
+finished:
+ if (!clean_used) {
+ pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
+ clean_used = true;
+ goto purge_list;
+ }
+
+ blkif->persistent_gnt_c -= (total - num_clean);
+ blkif->vbd.overflow_max_grants = 0;
+
+ /* We can defer this work */
+ INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
+ schedule_work(&blkif->persistent_purge_work);
+ pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
+ return;
+}
+
/*
* Retrieve from the 'pending_reqs' a free pending_req structure to be used.
*/
-static struct pending_req *alloc_req(void)
+static struct pending_req *alloc_req(struct xen_blkif *blkif)
{
struct pending_req *req = NULL;
unsigned long flags;
- spin_lock_irqsave(&blkbk->pending_free_lock, flags);
- if (!list_empty(&blkbk->pending_free)) {
- req = list_entry(blkbk->pending_free.next, struct pending_req,
+ spin_lock_irqsave(&blkif->pending_free_lock, flags);
+ if (!list_empty(&blkif->pending_free)) {
+ req = list_entry(blkif->pending_free.next, struct pending_req,
free_list);
list_del(&req->free_list);
}
- spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+ spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
return req;
}
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void)
* Return the 'pending_req' structure back to the freepool. We also
* wake up the thread if it was waiting for a free page.
*/
-static void free_req(struct pending_req *req)
+static void free_req(struct xen_blkif *blkif, struct pending_req *req)
{
unsigned long flags;
int was_empty;
- spin_lock_irqsave(&blkbk->pending_free_lock, flags);
- was_empty = list_empty(&blkbk->pending_free);
- list_add(&req->free_list, &blkbk->pending_free);
- spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+ spin_lock_irqsave(&blkif->pending_free_lock, flags);
+ was_empty = list_empty(&blkif->pending_free);
+ list_add(&req->free_list, &blkif->pending_free);
+ spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
if (was_empty)
- wake_up(&blkbk->pending_free_wq);
+ wake_up(&blkif->pending_free_wq);
}
/*
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
static void print_stats(struct xen_blkif *blkif)
{
pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
- " | ds %4llu\n",
+ " | ds %4llu | pg: %4u/%4d\n",
current->comm, blkif->st_oo_req,
blkif->st_rd_req, blkif->st_wr_req,
- blkif->st_f_req, blkif->st_ds_req);
+ blkif->st_f_req, blkif->st_ds_req,
+ blkif->persistent_gnt_c,
+ xen_blkif_max_pgrants);
blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
blkif->st_rd_req = 0;
blkif->st_wr_req = 0;
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg)
{
struct xen_blkif *blkif = arg;
struct xen_vbd *vbd = &blkif->vbd;
+ unsigned long timeout;
+ int ret;
xen_blkif_get(blkif);
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg)
if (unlikely(vbd->size != vbd_sz(vbd)))
xen_vbd_resize(blkif);
- wait_event_interruptible(
+ timeout = msecs_to_jiffies(LRU_INTERVAL);
+
+ timeout = wait_event_interruptible_timeout(
blkif->wq,
- blkif->waiting_reqs || kthread_should_stop());
- wait_event_interruptible(
- blkbk->pending_free_wq,
- !list_empty(&blkbk->pending_free) ||
- kthread_should_stop());
+ blkif->waiting_reqs || kthread_should_stop(),
+ timeout);
+ if (timeout == 0)
+ goto purge_gnt_list;
+ timeout = wait_event_interruptible_timeout(
+ blkif->pending_free_wq,
+ !list_empty(&blkif->pending_free) ||
+ kthread_should_stop(),
+ timeout);
+ if (timeout == 0)
+ goto purge_gnt_list;
blkif->waiting_reqs = 0;
smp_mb(); /* clear flag *before* checking for work */
- if (do_block_io_op(blkif))
+ ret = do_block_io_op(blkif);
+ if (ret > 0)
blkif->waiting_reqs = 1;
+ if (ret == -EACCES)
+ wait_event_interruptible(blkif->shutdown_wq,
+ kthread_should_stop());
+
+purge_gnt_list:
+ if (blkif->vbd.feature_gnt_persistent &&
+ time_after(jiffies, blkif->next_lru)) {
+ purge_persistent_gnt(blkif);
+ blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+ }
+
+ /* Shrink if we have more than xen_blkif_max_buffer_pages */
+ shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
if (log_stats && time_after(jiffies, blkif->st_print))
print_stats(blkif);
}
+ /* Since we are shutting down remove all pages from the buffer */
+ shrink_free_pagepool(blkif, 0 /* All */);
+
/* Free all persistent grant pages */
if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
- free_persistent_gnts(&blkif->persistent_gnts,
+ free_persistent_gnts(blkif, &blkif->persistent_gnts,
blkif->persistent_gnt_c);
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg)
return 0;
}
-struct seg_buf {
- unsigned int offset;
- unsigned int nsec;
-};
/*
* Unmap the grant references, and also remove the M2P over-rides
* used in the 'pending_req'.
*/
-static void xen_blkbk_unmap(struct pending_req *req)
+static void xen_blkbk_unmap(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
- grant_handle_t handle;
int ret;
- for (i = 0; i < req->nr_pages; i++) {
- if (!test_bit(i, req->unmap_seg))
+ for (i = 0; i < num; i++) {
+ if (pages[i]->persistent_gnt != NULL) {
+ put_persistent_gnt(blkif, pages[i]->persistent_gnt);
continue;
- handle = pending_handle(req, i);
- if (handle == BLKBACK_INVALID_HANDLE)
+ }
+ if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
continue;
- gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
- GNTMAP_host_map, handle);
- pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
- pages[invcount] = virt_to_page(vaddr(req, i));
- invcount++;
+ unmap_pages[invcount] = pages[i]->page;
+ gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
+ GNTMAP_host_map, pages[i]->handle);
+ pages[i]->handle = BLKBACK_INVALID_HANDLE;
+ if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
+ invcount);
+ BUG_ON(ret);
+ put_free_pages(blkif, unmap_pages, invcount);
+ invcount = 0;
+ }
+ }
+ if (invcount) {
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
+ BUG_ON(ret);
+ put_free_pages(blkif, unmap_pages, invcount);
}
-
- ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
- BUG_ON(ret);
}
-static int xen_blkbk_map(struct blkif_request *req,
- struct pending_req *pending_req,
- struct seg_buf seg[],
- struct page *pages[])
+static int xen_blkbk_map(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num, bool ro)
{
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt = NULL;
- struct xen_blkif *blkif = pending_req->blkif;
phys_addr_t addr = 0;
- int i, j;
- bool new_map;
- int nseg = req->u.rw.nr_segments;
+ int i, seg_idx, new_map_idx;
int segs_to_map = 0;
int ret = 0;
+ int last_map = 0, map_until = 0;
int use_persistent_gnts;
use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
- BUG_ON(blkif->persistent_gnt_c >
- max_mapped_grant_pages(pending_req->blkif->blk_protocol));
-
/*
* Fill out preq.nr_sects with proper amount of sectors, and setup
* assign map[..] with the PFN of the page in our domain with the
* corresponding grant reference for each page.
*/
- for (i = 0; i < nseg; i++) {
+again:
+ for (i = map_until; i < num; i++) {
uint32_t flags;
if (use_persistent_gnts)
persistent_gnt = get_persistent_gnt(
- &blkif->persistent_gnts,
- req->u.rw.seg[i].gref);
+ blkif,
+ pages[i]->gref);
if (persistent_gnt) {
/*
* We are using persistent grants and
* the grant is already mapped
*/
- new_map = false;
- } else if (use_persistent_gnts &&
- blkif->persistent_gnt_c <
- max_mapped_grant_pages(blkif->blk_protocol)) {
- /*
- * We are using persistent grants, the grant is
- * not mapped but we have room for it
- */
- new_map = true;
- persistent_gnt = kmalloc(
- sizeof(struct persistent_gnt),
- GFP_KERNEL);
- if (!persistent_gnt)
- return -ENOMEM;
- if (alloc_xenballooned_pages(1, &persistent_gnt->page,
- false)) {
- kfree(persistent_gnt);
- return -ENOMEM;
- }
- persistent_gnt->gnt = req->u.rw.seg[i].gref;
- persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
-
- pages_to_gnt[segs_to_map] =
- persistent_gnt->page;
- addr = (unsigned long) pfn_to_kaddr(
- page_to_pfn(persistent_gnt->page));
-
- add_persistent_gnt(&blkif->persistent_gnts,
- persistent_gnt);
- blkif->persistent_gnt_c++;
- pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
- persistent_gnt->gnt, blkif->persistent_gnt_c,
- max_mapped_grant_pages(blkif->blk_protocol));
+ pages[i]->page = persistent_gnt->page;
+ pages[i]->persistent_gnt = persistent_gnt;
} else {
- /*
- * We are either using persistent grants and
- * hit the maximum limit of grants mapped,
- * or we are not using persistent grants.
- */
- if (use_persistent_gnts &&
- !blkif->vbd.overflow_max_grants) {
- blkif->vbd.overflow_max_grants = 1;
- pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
- blkif->domid, blkif->vbd.handle);
- }
- new_map = true;
- pages[i] = blkbk->pending_page(pending_req, i);
- addr = vaddr(pending_req, i);
- pages_to_gnt[segs_to_map] =
- blkbk->pending_page(pending_req, i);
- }
-
- if (persistent_gnt) {
- pages[i] = persistent_gnt->page;
- persistent_gnts[i] = persistent_gnt;
- } else {
- persistent_gnts[i] = NULL;
- }
-
- if (new_map) {
+ if (get_free_page(blkif, &pages[i]->page))
+ goto out_of_memory;
+ addr = vaddr(pages[i]->page);
+ pages_to_gnt[segs_to_map] = pages[i]->page;
+ pages[i]->persistent_gnt = NULL;
flags = GNTMAP_host_map;
- if (!persistent_gnt &&
- (pending_req->operation != BLKIF_OP_READ))
+ if (!use_persistent_gnts && ro)
flags |= GNTMAP_readonly;
gnttab_set_map_op(&map[segs_to_map++], addr,
- flags, req->u.rw.seg[i].gref,
+ flags, pages[i]->gref,
blkif->domid);
}
+ map_until = i + 1;
+ if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
+ break;
}
if (segs_to_map) {
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req,
* so that when we access vaddr(pending_req,i) it has the contents of
* the page from the other domain.
*/
- bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
- for (i = 0, j = 0; i < nseg; i++) {
- if (!persistent_gnts[i] ||
- persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
+ for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
+ if (!pages[seg_idx]->persistent_gnt) {
/* This is a newly mapped grant */
- BUG_ON(j >= segs_to_map);
- if (unlikely(map[j].status != 0)) {
+ BUG_ON(new_map_idx >= segs_to_map);
+ if (unlikely(map[new_map_idx].status != 0)) {
pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
- map[j].handle = BLKBACK_INVALID_HANDLE;
+ pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
ret |= 1;
- if (persistent_gnts[i]) {
- rb_erase(&persistent_gnts[i]->node,
- &blkif->persistent_gnts);
- blkif->persistent_gnt_c--;
- kfree(persistent_gnts[i]);
- persistent_gnts[i] = NULL;
- }
+ goto next;
}
+ pages[seg_idx]->handle = map[new_map_idx].handle;
+ } else {
+ continue;
}
- if (persistent_gnts[i]) {
- if (persistent_gnts[i]->handle ==
- BLKBACK_INVALID_HANDLE) {
+ if (use_persistent_gnts &&
+ blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+ /*
+ * We are using persistent grants, the grant is
+ * not mapped but we might have room for it.
+ */
+ persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
+ GFP_KERNEL);
+ if (!persistent_gnt) {
/*
- * If this is a new persistent grant
- * save the handler
+ * If we don't have enough memory to
+ * allocate the persistent_gnt struct
+ * map this grant non-persistenly
*/
- persistent_gnts[i]->handle = map[j++].handle;
+ goto next;
}
- pending_handle(pending_req, i) =
- persistent_gnts[i]->handle;
+ persistent_gnt->gnt = map[new_map_idx].ref;
+ persistent_gnt->handle = map[new_map_idx].handle;
+ persistent_gnt->page = pages[seg_idx]->page;
+ if (add_persistent_gnt(blkif,
+ persistent_gnt)) {
+ kfree(persistent_gnt);
+ persistent_gnt = NULL;
+ goto next;
+ }
+ pages[seg_idx]->persistent_gnt = persistent_gnt;
+ pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
+ persistent_gnt->gnt, blkif->persistent_gnt_c,
+ xen_blkif_max_pgrants);
+ goto next;
+ }
+ if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
+ blkif->vbd.overflow_max_grants = 1;
+ pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
+ blkif->domid, blkif->vbd.handle);
+ }
+ /*
+ * We could not map this grant persistently, so use it as
+ * a non-persistent grant.
+ */
+next:
+ new_map_idx++;
+ }
+ segs_to_map = 0;
+ last_map = map_until;
+ if (map_until != num)
+ goto again;
- if (ret)
- continue;
- } else {
- pending_handle(pending_req, i) = map[j++].handle;
- bitmap_set(pending_req->unmap_seg, i, 1);
+ return ret;
+
+out_of_memory:
+ pr_alert(DRV_PFX "%s: out of memory\n", __func__);
+ put_free_pages(blkif, pages_to_gnt, segs_to_map);
+ return -ENOMEM;
+}
+
+static int xen_blkbk_map_seg(struct pending_req *pending_req)
+{
+ int rc;
+
+ rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+ pending_req->nr_pages,
+ (pending_req->operation != BLKIF_OP_READ));
+
+ return rc;
+}
- if (ret)
- continue;
+static int xen_blkbk_parse_indirect(struct blkif_request *req,
+ struct pending_req *pending_req,
+ struct seg_buf seg[],
+ struct phys_req *preq)
+{
+ struct grant_page **pages = pending_req->indirect_pages;
+ struct xen_blkif *blkif = pending_req->blkif;
+ int indirect_grefs, rc, n, nseg, i;
+ struct blkif_request_segment_aligned *segments = NULL;
+
+ nseg = pending_req->nr_pages;
+ indirect_grefs = INDIRECT_PAGES(nseg);
+ BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+ for (i = 0; i < indirect_grefs; i++)
+ pages[i]->gref = req->u.indirect.indirect_grefs[i];
+
+ rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+ if (rc)
+ goto unmap;
+
+ for (n = 0, i = 0; n < nseg; n++) {
+ if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
+ /* Map indirect segments */
+ if (segments)
+ kunmap_atomic(segments);
+ segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
+ }
+ i = n % SEGS_PER_INDIRECT_FRAME;
+ pending_req->segments[n]->gref = segments[i].gref;
+ seg[n].nsec = segments[i].last_sect -
+ segments[i].first_sect + 1;
+ seg[n].offset = (segments[i].first_sect << 9);
+ if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (segments[i].last_sect < segments[i].first_sect)) {
+ rc = -EINVAL;
+ goto unmap;
}
- seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+ preq->nr_sects += seg[n].nsec;
}
- return ret;
+
+unmap:
+ if (segments)
+ kunmap_atomic(segments);
+ xen_blkbk_unmap(blkif, pages, indirect_grefs);
+ return rc;
}
static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
int status = BLKIF_RSP_OKAY;
struct block_device *bdev = blkif->vbd.bdev;
unsigned long secure;
+ struct phys_req preq;
+
+ preq.sector_number = req->u.discard.sector_number;
+ preq.nr_sects = req->u.discard.nr_sectors;
+ err = xen_vbd_translate(&preq, blkif, WRITE);
+ if (err) {
+ pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
+ goto fail_response;
+ }
blkif->st_ds_req++;
xen_blkif_get(blkif);
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
req->u.discard.nr_sectors,
GFP_KERNEL, secure);
-
+fail_response:
if (err == -EOPNOTSUPP) {
pr_debug(DRV_PFX "discard op failed, not supported\n");
status = BLKIF_RSP_EOPNOTSUPP;
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif,
struct blkif_request *req,
struct pending_req *pending_req)
{
- free_req(pending_req);
+ free_req(blkif, pending_req);
make_response(blkif, req->u.other.id, req->operation,
BLKIF_RSP_EOPNOTSUPP);
return -EIO;
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
* the proper response on the ring.
*/
if (atomic_dec_and_test(&pending_req->pendcnt)) {
- xen_blkbk_unmap(pending_req);
+ xen_blkbk_unmap(pending_req->blkif,
+ pending_req->segments,
+ pending_req->nr_pages);
make_response(pending_req->blkif, pending_req->id,
pending_req->operation, pending_req->status);
xen_blkif_put(pending_req->blkif);
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
if (atomic_read(&pending_req->blkif->drain))
complete(&pending_req->blkif->drain_complete);
}
- free_req(pending_req);
+ free_req(pending_req->blkif, pending_req);
}
}
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif)
rp = blk_rings->common.sring->req_prod;
rmb(); /* Ensure we see queued requests up to 'rp'. */
+ if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
+ rc = blk_rings->common.rsp_prod_pvt;
+ pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
+ rp, rc, rp - rc, blkif->vbd.pdevice);
+ return -EACCES;
+ }
while (rc != rp) {
if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif)
break;
}
- pending_req = alloc_req();
+ pending_req = alloc_req(blkif);
if (NULL == pending_req) {
blkif->st_oo_req++;
more_to_do = 1;
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif)
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE:
+ case BLKIF_OP_INDIRECT:
if (dispatch_rw_block_io(blkif, &req, pending_req))
goto done;
break;
case BLKIF_OP_DISCARD:
- free_req(pending_req);
+ free_req(blkif, pending_req);
if (dispatch_discard_io(blkif, &req))
goto done;
break;
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
struct pending_req *pending_req)
{
struct phys_req preq;
- struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct seg_buf *seg = pending_req->seg;
unsigned int nseg;
struct bio *bio = NULL;
- struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct bio **biolist = pending_req->biolist;
int i, nbio = 0;
int operation;
struct blk_plug plug;
bool drain = false;
- struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct grant_page **pages = pending_req->segments;
+ unsigned short req_operation;
+
+ req_operation = req->operation == BLKIF_OP_INDIRECT ?
+ req->u.indirect.indirect_op : req->operation;
+ if ((req->operation == BLKIF_OP_INDIRECT) &&
+ (req_operation != BLKIF_OP_READ) &&
+ (req_operation != BLKIF_OP_WRITE)) {
+ pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
+ req_operation);
+ goto fail_response;
+ }
- switch (req->operation) {
+ switch (req_operation) {
case BLKIF_OP_READ:
blkif->st_rd_req++;
operation = READ;
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
}
/* Check that the number of segments is sane. */
- nseg = req->u.rw.nr_segments;
+ nseg = req->operation == BLKIF_OP_INDIRECT ?
+ req->u.indirect.nr_segments : req->u.rw.nr_segments;
if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
- unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ unlikely((req->operation != BLKIF_OP_INDIRECT) &&
+ (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
+ unlikely((req->operation == BLKIF_OP_INDIRECT) &&
+ (nseg > MAX_INDIRECT_SEGMENTS))) {
pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
nseg);
/* Haven't submitted any bio's yet. */
goto fail_response;
}
- preq.sector_number = req->u.rw.sector_number;
preq.nr_sects = 0;
pending_req->blkif = blkif;
pending_req->id = req->u.rw.id;
- pending_req->operation = req->operation;
+ pending_req->operation = req_operation;
pending_req->status = BLKIF_RSP_OKAY;
pending_req->nr_pages = nseg;
- for (i = 0; i < nseg; i++) {
- seg[i].nsec = req->u.rw.seg[i].last_sect -
- req->u.rw.seg[i].first_sect + 1;
- if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
- (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
+ if (req->operation != BLKIF_OP_INDIRECT) {
+ preq.dev = req->u.rw.handle;
+ preq.sector_number = req->u.rw.sector_number;
+ for (i = 0; i < nseg; i++) {
+ pages[i]->gref = req->u.rw.seg[i].gref;
+ seg[i].nsec = req->u.rw.seg[i].last_sect -
+ req->u.rw.seg[i].first_sect + 1;
+ seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+ if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (req->u.rw.seg[i].last_sect <
+ req->u.rw.seg[i].first_sect))
+ goto fail_response;
+ preq.nr_sects += seg[i].nsec;
+ }
+ } else {
+ preq.dev = req->u.indirect.handle;
+ preq.sector_number = req->u.indirect.sector_number;
+ if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
goto fail_response;
- preq.nr_sects += seg[i].nsec;
-
}
if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* the hypercall to unmap the grants - that is all done in
* xen_blkbk_unmap.
*/
- if (xen_blkbk_map(req, pending_req, seg, pages))
+ if (xen_blkbk_map_seg(pending_req))
goto fail_flush;
/*
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
for (i = 0; i < nseg; i++) {
while ((bio == NULL) ||
(bio_add_page(bio,
- pages[i],
+ pages[i]->page,
seg[i].nsec << 9,
seg[i].offset) == 0)) {
- bio = bio_alloc(GFP_KERNEL, nseg-i);
+ int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_KERNEL, nr_iovecs);
if (unlikely(bio == NULL))
goto fail_put_bio;
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
return 0;
fail_flush:
- xen_blkbk_unmap(pending_req);
+ xen_blkbk_unmap(blkif, pending_req->segments,
+ pending_req->nr_pages);
fail_response:
/* Haven't submitted any bio's yet. */
- make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
- free_req(pending_req);
+ make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+ free_req(blkif, pending_req);
msleep(1); /* back off a bit */
return -EIO;
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
static int __init xen_blkif_init(void)
{
- int i, mmap_pages;
int rc = 0;
if (!xen_domain())
return -ENODEV;
- blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
- if (!blkbk) {
- pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
- return -ENOMEM;
- }
-
- mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
-
- blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
- xen_blkif_reqs, GFP_KERNEL);
- blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
- mmap_pages, GFP_KERNEL);
- blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
- mmap_pages, GFP_KERNEL);
-
- if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
- !blkbk->pending_pages) {
- rc = -ENOMEM;
- goto out_of_memory;
- }
-
- for (i = 0; i < mmap_pages; i++) {
- blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
- blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
- if (blkbk->pending_pages[i] == NULL) {
- rc = -ENOMEM;
- goto out_of_memory;
- }
- }
rc = xen_blkif_interface_init();
if (rc)
goto failed_init;
- INIT_LIST_HEAD(&blkbk->pending_free);
- spin_lock_init(&blkbk->pending_free_lock);
- init_waitqueue_head(&blkbk->pending_free_wq);
-
- for (i = 0; i < xen_blkif_reqs; i++)
- list_add_tail(&blkbk->pending_reqs[i].free_list,
- &blkbk->pending_free);
-
rc = xen_blkif_xenbus_init();
if (rc)
goto failed_init;
- return 0;
-
- out_of_memory:
- pr_alert(DRV_PFX "%s: out of memory\n", __func__);
failed_init:
- kfree(blkbk->pending_reqs);
- kfree(blkbk->pending_grant_handles);
- if (blkbk->pending_pages) {
- for (i = 0; i < mmap_pages; i++) {
- if (blkbk->pending_pages[i])
- __free_page(blkbk->pending_pages[i]);
- }
- kfree(blkbk->pending_pages);
- }
- kfree(blkbk);
- blkbk = NULL;
return rc;
}
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 60103e2..8d88075 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
__func__, __LINE__, ##args)
+/*
+ * This is the maximum number of segments that would be allowed in indirect
+ * requests. This value will also be passed to the frontend.
+ */
+#define MAX_INDIRECT_SEGMENTS 256
+
+#define SEGS_PER_INDIRECT_FRAME \
+ (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+#define MAX_INDIRECT_PAGES \
+ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+#define INDIRECT_PAGES(_segs) \
+ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
/* Not a real protocol. Used to generate ring structs which contain
* the elements common to all protocols only. This way we get a
* compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
+struct blkif_x86_32_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad1;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+ /*
+ * The maximum number of indirect segments (and pages) that will
+ * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+ * is also exported to the guest (via xenstore
+ * feature-max-indirect-segments entry), so the frontend knows how
+ * many indirect segments the backend supports.
+ */
+ uint64_t _pad2; /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
struct blkif_x86_32_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_x86_32_request_rw rw;
struct blkif_x86_32_request_discard discard;
struct blkif_x86_32_request_other other;
+ struct blkif_x86_32_request_indirect indirect;
} u;
} __attribute__((__packed__));
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
+struct blkif_x86_64_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+ uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad2;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+ /*
+ * The maximum number of indirect segments (and pages) that will
+ * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+ * is also exported to the guest (via xenstore
+ * feature-max-indirect-segments entry), so the frontend knows how
+ * many indirect segments the backend supports.
+ */
+ uint32_t _pad3; /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
struct blkif_x86_64_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_x86_64_request_rw rw;
struct blkif_x86_64_request_discard discard;
struct blkif_x86_64_request_other other;
+ struct blkif_x86_64_request_indirect indirect;
} u;
} __attribute__((__packed__));
@@ -182,12 +234,26 @@ struct xen_vbd {
struct backend_info;
+/* Number of available flags */
+#define PERSISTENT_GNT_FLAGS_SIZE 2
+/* This persistent grant is currently in use */
+#define PERSISTENT_GNT_ACTIVE 0
+/*
+ * This persistent grant has been used, this flag is set when we remove the
+ * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
+ */
+#define PERSISTENT_GNT_WAS_ACTIVE 1
+
+/* Number of requests that we can fit in a ring */
+#define XEN_BLKIF_REQS 32
struct persistent_gnt {
struct page *page;
grant_ref_t gnt;
grant_handle_t handle;
+ DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
struct rb_node node;
+ struct list_head remove_node;
};
struct xen_blkif {
@@ -219,6 +285,23 @@ struct xen_blkif {
/* tree to store persistent grants */
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
+ atomic_t persistent_gnt_in_use;
+ unsigned long next_lru;
+
+ /* used by the kworker that offload work from the persistent purge */
+ struct list_head persistent_purge_list;
+ struct work_struct persistent_purge_work;
+
+ /* buffer of free pages to map grant refs */
+ spinlock_t free_pages_lock;
+ int free_pages_num;
+ struct list_head free_pages;
+
+ /* List of all 'pending_req' available */
+ struct list_head pending_free;
+ /* And its spinlock. */
+ spinlock_t pending_free_lock;
+ wait_queue_head_t pending_free_wq;
/* statistics */
unsigned long st_print;
@@ -231,6 +314,41 @@ struct xen_blkif {
unsigned long long st_wr_sect;
wait_queue_head_t waiting_to_free;
+ /* Thread shutdown wait queue. */
+ wait_queue_head_t shutdown_wq;
+};
+
+struct seg_buf {
+ unsigned long offset;
+ unsigned int nsec;
+};
+
+struct grant_page {
+ struct page *page;
+ struct persistent_gnt *persistent_gnt;
+ grant_handle_t handle;
+ grant_ref_t gref;
+};
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+struct pending_req {
+ struct xen_blkif *blkif;
+ u64 id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+ struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
+ /* Indirect descriptors */
+ struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
+ struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
+ struct bio *biolist[MAX_INDIRECT_SEGMENTS];
};
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg);
+int xen_blkif_purge_persistent(void *arg);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state);
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
struct blkif_x86_32_request *src)
{
- int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
dst->operation = src->operation;
switch (src->operation) {
case BLKIF_OP_READ:
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
+ case BLKIF_OP_INDIRECT:
+ dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+ dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+ dst->u.indirect.handle = src->u.indirect.handle;
+ dst->u.indirect.id = src->u.indirect.id;
+ dst->u.indirect.sector_number = src->u.indirect.sector_number;
+ barrier();
+ j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+ for (i = 0; i < j; i++)
+ dst->u.indirect.indirect_grefs[i] =
+ src->u.indirect.indirect_grefs[i];
+ break;
default:
/*
* Don't know how to translate this op. Only get the
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
struct blkif_x86_64_request *src)
{
- int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
dst->operation = src->operation;
switch (src->operation) {
case BLKIF_OP_READ:
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
+ case BLKIF_OP_INDIRECT:
+ dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+ dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+ dst->u.indirect.handle = src->u.indirect.handle;
+ dst->u.indirect.id = src->u.indirect.id;
+ dst->u.indirect.sector_number = src->u.indirect.sector_number;
+ barrier();
+ j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+ for (i = 0; i < j; i++)
+ dst->u.indirect.indirect_grefs[i] =
+ src->u.indirect.indirect_grefs[i];
+ break;
default:
/*
* Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 8bfd1bc..2e5b69d 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
err = PTR_ERR(blkif->xenblkd);
blkif->xenblkd = NULL;
xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+ return;
}
}
static struct xen_blkif *xen_blkif_alloc(domid_t domid)
{
struct xen_blkif *blkif;
+ struct pending_req *req, *n;
+ int i, j;
+
+ BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
if (!blkif)
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
blkif->st_print = jiffies;
init_waitqueue_head(&blkif->waiting_to_free);
blkif->persistent_gnts.rb_node = NULL;
+ spin_lock_init(&blkif->free_pages_lock);
+ INIT_LIST_HEAD(&blkif->free_pages);
+ blkif->free_pages_num = 0;
+ atomic_set(&blkif->persistent_gnt_in_use, 0);
+
+ INIT_LIST_HEAD(&blkif->pending_free);
+
+ for (i = 0; i < XEN_BLKIF_REQS; i++) {
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ goto fail;
+ list_add_tail(&req->free_list,
+ &blkif->pending_free);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ req->segments[j] = kzalloc(sizeof(*req->segments[0]),
+ GFP_KERNEL);
+ if (!req->segments[j])
+ goto fail;
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+ GFP_KERNEL);
+ if (!req->indirect_pages[j])
+ goto fail;
+ }
+ }
+ spin_lock_init(&blkif->pending_free_lock);
+ init_waitqueue_head(&blkif->pending_free_wq);
+ init_waitqueue_head(&blkif->shutdown_wq);
return blkif;
+
+fail:
+ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ if (!req->segments[j])
+ break;
+ kfree(req->segments[j]);
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ if (!req->indirect_pages[j])
+ break;
+ kfree(req->indirect_pages[j]);
+ }
+ kfree(req);
+ }
+
+ kmem_cache_free(xen_blkif_cachep, blkif);
+
+ return ERR_PTR(-ENOMEM);
}
static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
{
if (blkif->xenblkd) {
kthread_stop(blkif->xenblkd);
+ wake_up(&blkif->shutdown_wq);
blkif->xenblkd = NULL;
}
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
static void xen_blkif_free(struct xen_blkif *blkif)
{
+ struct pending_req *req, *n;
+ int i = 0, j;
+
if (!atomic_dec_and_test(&blkif->refcnt))
BUG();
+
+ /* Check that there is no request in use */
+ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+ kfree(req->segments[j]);
+
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+ kfree(req->indirect_pages[j]);
+
+ kfree(req);
+ i++;
+ }
+
+ WARN_ON(i != XEN_BLKIF_REQS);
+
kmem_cache_free(xen_blkif_cachep, blkif);
}
@@ -678,6 +753,11 @@ again:
dev->nodename);
goto abort;
}
+ err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
+ MAX_INDIRECT_SEGMENTS);
+ if (err)
+ dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
+ dev->nodename, err);
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -704,6 +784,11 @@ again:
dev->nodename);
goto abort;
}
+ err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
+ bdev_physical_block_size(be->blkif->vbd.bdev));
+ if (err)
+ xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
+ dev->nodename);
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d89ef86..a4660bb 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,30 @@ struct grant {
struct blk_shadow {
struct blkif_request req;
struct request *request;
- struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct grant **grants_used;
+ struct grant **indirect_grants;
+ struct scatterlist *sg;
+};
+
+struct split_bio {
+ struct bio *bio;
+ atomic_t pending;
+ int err;
};
static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
+/*
+ * Maximum number of segments in indirect requests, the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
+MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
+
#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
/*
@@ -98,7 +116,6 @@ struct blkfront_info
enum blkif_state connected;
int ring_ref;
struct blkif_front_ring ring;
- struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int evtchn, irq;
struct request_queue *rq;
struct work_struct work;
@@ -114,6 +131,7 @@ struct blkfront_info
unsigned int discard_granularity;
unsigned int discard_alignment;
unsigned int feature_persistent:1;
+ unsigned int max_indirect_segments;
int is_ready;
};
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
#define DEV_NAME "xvd" /* name in /dev */
+#define SEGS_PER_INDIRECT_FRAME \
+ (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+#define INDIRECT_GREFS(_segs) \
+ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+static int blkfront_setup_indirect(struct blkfront_info *info);
+
static int get_id_from_freelist(struct blkfront_info *info)
{
unsigned long free = info->shadow_free;
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
struct blkif_request *ring_req;
unsigned long id;
unsigned int fsect, lsect;
- int i, ref;
+ int i, ref, n;
+ struct blkif_request_segment_aligned *segments = NULL;
/*
* Used to store if we are able to queue the request by just using
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
grant_ref_t gref_head;
struct grant *gnt_list_entry = NULL;
struct scatterlist *sg;
+ int nseg, max_grefs;
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return 1;
- /* Check if we have enought grants to allocate a requests */
- if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ max_grefs = info->max_indirect_segments ?
+ info->max_indirect_segments +
+ INDIRECT_GREFS(info->max_indirect_segments) :
+ BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+ /* Check if we have enough grants to allocate a requests */
+ if (info->persistent_gnts_c < max_grefs) {
new_persistent_gnts = 1;
if (gnttab_alloc_grant_references(
- BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
+ max_grefs - info->persistent_gnts_c,
&gref_head) < 0) {
gnttab_request_free_callback(
&info->callback,
blkif_restart_queue_callback,
info,
- BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ max_grefs);
return 1;
}
} else
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
id = get_id_from_freelist(info);
info->shadow[id].request = req;
- ring_req->u.rw.id = id;
- ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
- ring_req->u.rw.handle = info->handle;
-
- ring_req->operation = rq_data_dir(req) ?
- BLKIF_OP_WRITE : BLKIF_OP_READ;
-
- if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
- /*
- * Ideally we can do an unordered flush-to-disk. In case the
- * backend onlysupports barriers, use that. A barrier request
- * a superset of FUA, so we can implement it the same
- * way. (It's also a FLUSH+FUA, since it is
- * guaranteed ordered WRT previous writes.)
- */
- ring_req->operation = info->flush_op;
- }
-
if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
- /* id, sector_number and handle are set above. */
ring_req->operation = BLKIF_OP_DISCARD;
ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+ ring_req->u.discard.id = id;
+ ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
else
ring_req->u.discard.flag = 0;
} else {
- ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
- info->sg);
- BUG_ON(ring_req->u.rw.nr_segments >
- BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
- for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
+ BUG_ON(info->max_indirect_segments == 0 &&
+ req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ BUG_ON(info->max_indirect_segments &&
+ req->nr_phys_segments > info->max_indirect_segments);
+ nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+ ring_req->u.rw.id = id;
+ if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ /*
+ * The indirect operation can only be a BLKIF_OP_READ or
+ * BLKIF_OP_WRITE
+ */
+ BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+ ring_req->operation = BLKIF_OP_INDIRECT;
+ ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+ BLKIF_OP_WRITE : BLKIF_OP_READ;
+ ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.indirect.handle = info->handle;
+ ring_req->u.indirect.nr_segments = nseg;
+ } else {
+ ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.rw.handle = info->handle;
+ ring_req->operation = rq_data_dir(req) ?
+ BLKIF_OP_WRITE : BLKIF_OP_READ;
+ if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+ /*
+ * Ideally we can do an unordered flush-to-disk. In case the
+ * backend onlysupports barriers, use that. A barrier request
+ * a superset of FUA, so we can implement it the same
+ * way. (It's also a FLUSH+FUA, since it is
+ * guaranteed ordered WRT previous writes.)
+ */
+ ring_req->operation = info->flush_op;
+ }
+ ring_req->u.rw.nr_segments = nseg;
+ }
+ for_each_sg(info->shadow[id].sg, sg, nseg, i) {
fsect = sg->offset >> 9;
lsect = fsect + (sg->length >> 9) - 1;
+ if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+ (i % SEGS_PER_INDIRECT_FRAME == 0)) {
+ if (segments)
+ kunmap_atomic(segments);
+
+ n = i / SEGS_PER_INDIRECT_FRAME;
+ gnt_list_entry = get_grant(&gref_head, info);
+ info->shadow[id].indirect_grants[n] = gnt_list_entry;
+ segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+ ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+ }
+
gnt_list_entry = get_grant(&gref_head, info);
ref = gnt_list_entry->gref;
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
BUG_ON(sg->offset + sg->length > PAGE_SIZE);
- shared_data = kmap_atomic(
- pfn_to_page(gnt_list_entry->pfn));
+ shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
bvec_data = kmap_atomic(sg_page(sg));
/*
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
kunmap_atomic(bvec_data);
kunmap_atomic(shared_data);
}
-
- ring_req->u.rw.seg[i] =
- (struct blkif_request_segment) {
- .gref = ref,
- .first_sect = fsect,
- .last_sect = lsect };
+ if (ring_req->operation != BLKIF_OP_INDIRECT) {
+ ring_req->u.rw.seg[i] =
+ (struct blkif_request_segment) {
+ .gref = ref,
+ .first_sect = fsect,
+ .last_sect = lsect };
+ } else {
+ n = i % SEGS_PER_INDIRECT_FRAME;
+ segments[n] =
+ (struct blkif_request_segment_aligned) {
+ .gref = ref,
+ .first_sect = fsect,
+ .last_sect = lsect };
+ }
}
+ if (segments)
+ kunmap_atomic(segments);
}
info->ring.req_prod_pvt++;
@@ -542,7 +608,9 @@ wait:
flush_requests(info);
}
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+ unsigned int physical_sector_size,
+ unsigned int segments)
{
struct request_queue *rq;
struct blkfront_info *info = gd->private_data;
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
/* Hard sector size and max sectors impersonate the equiv. hardware. */
blk_queue_logical_block_size(rq, sector_size);
- blk_queue_max_hw_sectors(rq, 512);
+ blk_queue_physical_block_size(rq, physical_sector_size);
+ blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
/* Each segment in a request is up to an aligned page in size. */
blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
blk_queue_max_segment_size(rq, PAGE_SIZE);
/* Ensure a merged request will fit in a single I/O ring slot. */
- blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ blk_queue_max_segments(rq, segments);
/* Make sure buffer addresses are sector-aligned. */
blk_queue_dma_alignment(rq, 511);
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
static void xlvbd_flush(struct blkfront_info *info)
{
blk_queue_flush(info->rq, info->feature_flush);
- printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
+ printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
info->gd->disk_name,
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
"flush diskcache" : "barrier or flush"),
- info->feature_flush ? "enabled" : "disabled",
- info->feature_persistent ? "using persistent grants" : "");
+ info->feature_flush ? "enabled;" : "disabled;",
+ "persistent grants:",
+ info->feature_persistent ? "enabled;" : "disabled;",
+ "indirect descriptors:",
+ info->max_indirect_segments ? "enabled;" : "disabled;");
}
static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info,
- u16 vdisk_info, u16 sector_size)
+ u16 vdisk_info, u16 sector_size,
+ unsigned int physical_sector_size)
{
struct gendisk *gd;
int nr_minors = 1;
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
gd->driverfs_dev = &(info->xbdev->dev);
set_capacity(gd, capacity);
- if (xlvbd_init_blk_queue(gd, sector_size)) {
+ if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
+ info->max_indirect_segments ? :
+ BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
del_gendisk(gd);
goto release;
}
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
{
struct grant *persistent_gnt;
struct grant *n;
+ int i, j, segs;
/* Prevent new requests being issued until we fix things up. */
spin_lock_irq(&info->io_lock);
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
}
BUG_ON(info->persistent_gnts_c != 0);
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ /*
+ * Clear persistent grants present in requests already
+ * on the shared ring
+ */
+ if (!info->shadow[i].request)
+ goto free_shadow;
+
+ segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+ info->shadow[i].req.u.indirect.nr_segments :
+ info->shadow[i].req.u.rw.nr_segments;
+ for (j = 0; j < segs; j++) {
+ persistent_gnt = info->shadow[i].grants_used[j];
+ gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+ __free_page(pfn_to_page(persistent_gnt->pfn));
+ kfree(persistent_gnt);
+ }
+
+ if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+ /*
+ * If this is not an indirect operation don't try to
+ * free indirect segments
+ */
+ goto free_shadow;
+
+ for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+ persistent_gnt = info->shadow[i].indirect_grants[j];
+ gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+ __free_page(pfn_to_page(persistent_gnt->pfn));
+ kfree(persistent_gnt);
+ }
+
+free_shadow:
+ kfree(info->shadow[i].grants_used);
+ info->shadow[i].grants_used = NULL;
+ kfree(info->shadow[i].indirect_grants);
+ info->shadow[i].indirect_grants = NULL;
+ kfree(info->shadow[i].sg);
+ info->shadow[i].sg = NULL;
+ }
+
/* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback);
spin_unlock_irq(&info->io_lock);
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
struct blkif_response *bret)
{
int i = 0;
- struct bio_vec *bvec;
- struct req_iterator iter;
- unsigned long flags;
+ struct scatterlist *sg;
char *bvec_data;
void *shared_data;
- unsigned int offset = 0;
+ int nseg;
+
+ nseg = s->req.operation == BLKIF_OP_INDIRECT ?
+ s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
if (bret->operation == BLKIF_OP_READ) {
/*
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
* than PAGE_SIZE, we have to keep track of the current offset,
* to be sure we are copying the data from the right shared page.
*/
- rq_for_each_segment(bvec, s->request, iter) {
- BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
- if (bvec->bv_offset < offset)
- i++;
- BUG_ON(i >= s->req.u.rw.nr_segments);
+ for_each_sg(s->sg, sg, nseg, i) {
+ BUG_ON(sg->offset + sg->length > PAGE_SIZE);
shared_data = kmap_atomic(
pfn_to_page(s->grants_used[i]->pfn));
- bvec_data = bvec_kmap_irq(bvec, &flags);
- memcpy(bvec_data, shared_data + bvec->bv_offset,
- bvec->bv_len);
- bvec_kunmap_irq(bvec_data, &flags);
+ bvec_data = kmap_atomic(sg_page(sg));
+ memcpy(bvec_data + sg->offset,
+ shared_data + sg->offset,
+ sg->length);
+ kunmap_atomic(bvec_data);
kunmap_atomic(shared_data);
- offset = bvec->bv_offset + bvec->bv_len;
}
}
/* Add the persistent grant into the list of free grants */
- for (i = 0; i < s->req.u.rw.nr_segments; i++) {
+ for (i = 0; i < nseg; i++) {
list_add(&s->grants_used[i]->node, &info->persistent_gnts);
info->persistent_gnts_c++;
}
+ if (s->req.operation == BLKIF_OP_INDIRECT) {
+ for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
+ list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
+ info->persistent_gnts_c++;
+ }
+ }
}
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
SHARED_RING_INIT(sring);
FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
- sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
- /* Allocate memory for grants */
- err = fill_grant_buffer(info, BLK_RING_SIZE *
- BLKIF_MAX_SEGMENTS_PER_REQUEST);
- if (err)
- goto fail;
-
err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
if (err < 0) {
free_page((unsigned long)sring);
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
return 0;
}
+/*
+ * This is a clone of md_trim_bio, used to split a bio into smaller ones
+ */
+static void trim_bio(struct bio *bio, int offset, int size)
+{
+ /* 'bio' is a cloned bio which we need to trim to match
+ * the given offset and size.
+ * This requires adjusting bi_sector, bi_size, and bi_io_vec
+ */
+ int i;
+ struct bio_vec *bvec;
+ int sofar = 0;
+
+ size <<= 9;
+ if (offset == 0 && size == bio->bi_size)
+ return;
+
+ bio->bi_sector += offset;
+ bio->bi_size = size;
+ offset <<= 9;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+ while (bio->bi_idx < bio->bi_vcnt &&
+ bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+ /* remove this whole bio_vec */
+ offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+ bio->bi_idx++;
+ }
+ if (bio->bi_idx < bio->bi_vcnt) {
+ bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+ bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+ }
+ /* avoid any complications with bi_idx being non-zero*/
+ if (bio->bi_idx) {
+ memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+ (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+ bio->bi_vcnt -= bio->bi_idx;
+ bio->bi_idx = 0;
+ }
+ /* Make sure vcnt and last bv are not too big */
+ bio_for_each_segment(bvec, bio, i) {
+ if (sofar + bvec->bv_len > size)
+ bvec->bv_len = size - sofar;
+ if (bvec->bv_len == 0) {
+ bio->bi_vcnt = i;
+ break;
+ }
+ sofar += bvec->bv_len;
+ }
+}
+
+static void split_bio_end(struct bio *bio, int error)
+{
+ struct split_bio *split_bio = bio->bi_private;
+
+ if (error)
+ split_bio->err = error;
+
+ if (atomic_dec_and_test(&split_bio->pending)) {
+ split_bio->bio->bi_phys_segments = 0;
+ bio_endio(split_bio->bio, split_bio->err);
+ kfree(split_bio);
+ }
+ bio_put(bio);
+}
static int blkif_recover(struct blkfront_info *info)
{
int i;
- struct blkif_request *req;
+ struct request *req, *n;
struct blk_shadow *copy;
- int j;
+ int rc;
+ struct bio *bio, *cloned_bio;
+ struct bio_list bio_list, merge_bio;
+ unsigned int segs, offset;
+ int pending, size;
+ struct split_bio *split_bio;
+ struct list_head requests;
/* Stage 1: Make a safe copy of the shadow state. */
copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
info->shadow_free = info->ring.req_prod_pvt;
info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
- /* Stage 3: Find pending requests and requeue them. */
+ rc = blkfront_setup_indirect(info);
+ if (rc) {
+ kfree(copy);
+ return rc;
+ }
+
+ segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ blk_queue_max_segments(info->rq, segs);
+ bio_list_init(&bio_list);
+ INIT_LIST_HEAD(&requests);
for (i = 0; i < BLK_RING_SIZE; i++) {
/* Not in use? */
if (!copy[i].request)
continue;
- /* Grab a request slot and copy shadow state into it. */
- req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
- *req = copy[i].req;
-
- /* We get a new request id, and must reset the shadow state. */
- req->u.rw.id = get_id_from_freelist(info);
- memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
-
- if (req->operation != BLKIF_OP_DISCARD) {
- /* Rewrite any grant references invalidated by susp/resume. */
- for (j = 0; j < req->u.rw.nr_segments; j++)
- gnttab_grant_foreign_access_ref(
- req->u.rw.seg[j].gref,
- info->xbdev->otherend_id,
- pfn_to_mfn(copy[i].grants_used[j]->pfn),
- 0);
+ /*
+ * Get the bios in the request so we can re-queue them.
+ */
+ if (copy[i].request->cmd_flags &
+ (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ /*
+ * Flush operations don't contain bios, so
+ * we need to requeue the whole request
+ */
+ list_add(&copy[i].request->queuelist, &requests);
+ continue;
}
- info->shadow[req->u.rw.id].req = *req;
-
- info->ring.req_prod_pvt++;
+ merge_bio.head = copy[i].request->bio;
+ merge_bio.tail = copy[i].request->biotail;
+ bio_list_merge(&bio_list, &merge_bio);
+ copy[i].request->bio = NULL;
+ blk_put_request(copy[i].request);
}
kfree(copy);
+ /*
+ * Empty the queue, this is important because we might have
+ * requests in the queue with more segments than what we
+ * can handle now.
+ */
+ spin_lock_irq(&info->io_lock);
+ while ((req = blk_fetch_request(info->rq)) != NULL) {
+ if (req->cmd_flags &
+ (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ list_add(&req->queuelist, &requests);
+ continue;
+ }
+ merge_bio.head = req->bio;
+ merge_bio.tail = req->biotail;
+ bio_list_merge(&bio_list, &merge_bio);
+ req->bio = NULL;
+ if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
+ pr_alert("diskcache flush request found!\n");
+ __blk_put_request(info->rq, req);
+ }
+ spin_unlock_irq(&info->io_lock);
+
xenbus_switch_state(info->xbdev, XenbusStateConnected);
spin_lock_irq(&info->io_lock);
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
- /* Send off requeued requests */
- flush_requests(info);
-
/* Kick any other new requests queued since we resumed */
kick_pending_request_queues(info);
+ list_for_each_entry_safe(req, n, &requests, queuelist) {
+ /* Requeue pending requests (flush or discard) */
+ list_del_init(&req->queuelist);
+ BUG_ON(req->nr_phys_segments > segs);
+ blk_requeue_request(info->rq, req);
+ }
spin_unlock_irq(&info->io_lock);
+ while ((bio = bio_list_pop(&bio_list)) != NULL) {
+ /* Traverse the list of pending bios and re-queue them */
+ if (bio_segments(bio) > segs) {
+ /*
+ * This bio has more segments than what we can
+ * handle, we have to split it.
+ */
+ pending = (bio_segments(bio) + segs - 1) / segs;
+ split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
+ BUG_ON(split_bio == NULL);
+ atomic_set(&split_bio->pending, pending);
+ split_bio->bio = bio;
+ for (i = 0; i < pending; i++) {
+ offset = (i * segs * PAGE_SIZE) >> 9;
+ size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
+ (unsigned int)(bio->bi_size >> 9) - offset);
+ cloned_bio = bio_clone(bio, GFP_NOIO);
+ BUG_ON(cloned_bio == NULL);
+ trim_bio(cloned_bio, offset, size);
+ cloned_bio->bi_private = split_bio;
+ cloned_bio->bi_end_io = split_bio_end;
+ submit_bio(cloned_bio->bi_rw, cloned_bio);
+ }
+ /*
+ * Now we have to wait for all those smaller bios to
+ * end, so we can also end the "parent" bio.
+ */
+ continue;
+ }
+ /* We don't need to split this bio */
+ submit_bio(bio->bi_rw, bio);
+ }
+
return 0;
}
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
err = talk_to_blkback(dev, info);
- if (info->connected == BLKIF_STATE_SUSPENDED && !err)
- err = blkif_recover(info);
+
+ /*
+ * We have to wait for the backend to switch to
+ * connected state, since we want to read which
+ * features it supports.
+ */
return err;
}
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
kfree(type);
}
+static int blkfront_setup_indirect(struct blkfront_info *info)
+{
+ unsigned int indirect_segments, segs;
+ int err, i;
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-max-indirect-segments", "%u", &indirect_segments,
+ NULL);
+ if (err) {
+ info->max_indirect_segments = 0;
+ segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ } else {
+ info->max_indirect_segments = min(indirect_segments,
+ xen_blkif_max_segments);
+ segs = info->max_indirect_segments;
+ }
+
+ err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+ if (err)
+ goto out_of_memory;
+
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ info->shadow[i].grants_used = kzalloc(
+ sizeof(info->shadow[i].grants_used[0]) * segs,
+ GFP_NOIO);
+ info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
+ if (info->max_indirect_segments)
+ info->shadow[i].indirect_grants = kzalloc(
+ sizeof(info->shadow[i].indirect_grants[0]) *
+ INDIRECT_GREFS(segs),
+ GFP_NOIO);
+ if ((info->shadow[i].grants_used == NULL) ||
+ (info->shadow[i].sg == NULL) ||
+ (info->max_indirect_segments &&
+ (info->shadow[i].indirect_grants == NULL)))
+ goto out_of_memory;
+ sg_init_table(info->shadow[i].sg, segs);
+ }
+
+
+ return 0;
+
+out_of_memory:
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ kfree(info->shadow[i].grants_used);
+ info->shadow[i].grants_used = NULL;
+ kfree(info->shadow[i].sg);
+ info->shadow[i].sg = NULL;
+ kfree(info->shadow[i].indirect_grants);
+ info->shadow[i].indirect_grants = NULL;
+ }
+ return -ENOMEM;
+}
+
/*
* Invoked when the backend is finally 'ready' (and has told produced
* the details about the physical device - #sectors, size, etc).
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
{
unsigned long long sectors;
unsigned long sector_size;
+ unsigned int physical_sector_size;
unsigned int binfo;
int err;
int barrier, flush, discard, persistent;
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
set_capacity(info->gd, sectors);
revalidate_disk(info->gd);
- /* fall through */
+ return;
case BLKIF_STATE_SUSPENDED:
+ /*
+ * If we are recovering from suspension, we need to wait
+ * for the backend to announce it's features before
+ * reconnecting, at least we need to know if the backend
+ * supports indirect descriptors, and how many.
+ */
+ blkif_recover(info);
return;
default:
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
return;
}
+ /*
+ * physcial-sector-size is a newer field, so old backends may not
+ * provide this. Assume physical sector size to be the same as
+ * sector_size in that case.
+ */
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "physical-sector-size", "%u", &physical_sector_size);
+ if (err != 1)
+ physical_sector_size = sector_size;
+
info->feature_flush = 0;
info->flush_op = 0;
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
else
info->feature_persistent = persistent;
- err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
+ err = blkfront_setup_indirect(info);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+ info->xbdev->otherend);
+ return;
+ }
+
+ err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
+ physical_sector_size);
if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
info->xbdev->otherend);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8bda129..dac7738 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -272,6 +272,8 @@ enum {
* - memcg: use_hierarchy is on by default and the cgroup file for
* the flag is not created.
*
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
* The followings are planned changes.
*
* - release_agent will be disallowed once replacement notification
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 1b4d4ee..de7d74a 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -177,7 +177,11 @@ enum drbd_ret_code {
ERR_NEED_APV_100 = 163,
ERR_NEED_ALLOW_TWO_PRI = 164,
ERR_MD_UNCLEAN = 165,
-
+ ERR_MD_LAYOUT_CONNECTED = 166,
+ ERR_MD_LAYOUT_TOO_BIG = 167,
+ ERR_MD_LAYOUT_TOO_SMALL = 168,
+ ERR_MD_LAYOUT_NO_FIT = 169,
+ ERR_IMPLICIT_SHRINK = 170,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
};
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index d0d8fac..e8c4457 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -181,6 +181,8 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
__u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size)
__flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force)
__flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync)
+ __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF)
+ __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF)
)
GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 1fedf2b..17e50bb 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -215,4 +215,13 @@
#define DRBD_ALWAYS_ASBP_DEF 0
#define DRBD_USE_RLE_DEF 1
+#define DRBD_AL_STRIPES_MIN 1
+#define DRBD_AL_STRIPES_MAX 1024
+#define DRBD_AL_STRIPES_DEF 1
+#define DRBD_AL_STRIPES_SCALE '1'
+
+#define DRBD_AL_STRIPE_SIZE_MIN 4
+#define DRBD_AL_STRIPE_SIZE_MAX 16777216
+#define DRBD_AL_STRIPE_SIZE_DEF 32
+#define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
#endif
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ffd4652..65e1209 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t;
#define BLKIF_OP_DISCARD 5
/*
+ * Recognized if "feature-max-indirect-segments" in present in the backend
+ * xenbus info. The "feature-max-indirect-segments" node contains the maximum
+ * number of segments allowed by the backend per request. If the node is
+ * present, the frontend might use blkif_request_indirect structs in order to
+ * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
+ * maximum number of indirect segments is fixed by the backend, but the
+ * frontend can issue requests with any number of indirect segments as long as
+ * it's less than the number provided by the backend. The indirect_grefs field
+ * in blkif_request_indirect should be filled by the frontend with the
+ * grant references of the pages that are holding the indirect segments.
+ * This pages are filled with an array of blkif_request_segment_aligned
+ * that hold the information about the segments. The number of indirect
+ * pages to use is determined by the maximum number of segments
+ * a indirect request contains. Every indirect page can contain a maximum
+ * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
+ * so to calculate the number of indirect pages to use we have to do
+ * ceil(indirect_segments/512).
+ *
+ * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
+ * create the "feature-max-indirect-segments" node!
+ */
+#define BLKIF_OP_INDIRECT 6
+
+/*
* Maximum scatter/gather segments per request.
* This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
* NB. This could be 12 if the ring indexes weren't stored in the same page.
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
+
+struct blkif_request_segment_aligned {
+ grant_ref_t gref; /* reference to I/O buffer frame */
+ /* @first_sect: first sector in frame to transfer (inclusive). */
+ /* @last_sect: last sector in frame to transfer (inclusive). */
+ uint8_t first_sect, last_sect;
+ uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
+} __attribute__((__packed__));
+
struct blkif_request_rw {
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
@@ -147,12 +181,31 @@ struct blkif_request_other {
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
+struct blkif_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+#ifdef CONFIG_X86_64
+ uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
+#endif
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad2;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+#ifdef CONFIG_X86_64
+ uint32_t _pad3; /* make it 64 byte aligned */
+#else
+ uint64_t _pad3; /* make it 64 byte aligned */
+#endif
+} __attribute__((__packed__));
+
struct blkif_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_request_rw rw;
struct blkif_request_discard discard;
struct blkif_request_other other;
+ struct blkif_request_indirect indirect;
} u;
} __attribute__((__packed__));
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index 75271b9..7d28aff 100644
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -188,6 +188,11 @@ struct __name##_back_ring { \
#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
(((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
+/* Ill-behaved frontend determination: Can there be this many requests? */
+#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \
+ (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
+
+
#define RING_PUSH_REQUESTS(_r) do { \
wmb(); /* back sees requests /before/ updated producer index */ \
(_r)->sring->req_prod = (_r)->req_prod_pvt; \
OpenPOWER on IntegriCloud