From a54c8f0f2d7df525ff997e2afe71866a1a013064 Mon Sep 17 00:00:00 2001
From: Cathy Avery <cathy.avery@oracle.com>
Date: Fri, 2 Oct 2015 09:35:01 -0400
Subject: xen-blkfront: check for null drvdata in blkback_changed
 (XenbusStateClosing)

xen-blkfront will crash if the check to talk_to_blkback()
in blkback_changed()(XenbusStateInitWait) returns an error.
The driver data is freed and info is set to NULL. Later during
the close process via talk_to_blkback's call to xenbus_dev_fatal()
the null pointer is passed to and dereference in blkfront_closing.

CC: stable@vger.kernel.org
Signed-off-by: Cathy Avery <cathy.avery@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6d89ed3..c8fdbc7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1968,7 +1968,8 @@ static void blkback_changed(struct xenbus_device *dev,
 			break;
 		/* Missed the backend's Closing state -- fallthrough */
 	case XenbusStateClosing:
-		blkfront_closing(info);
+		if (info)
+			blkfront_closing(info);
 		break;
 	}
 }
-- 
cgit v1.1


From dcc909d90ccdbb73226397ff6d298f7af35b0e11 Mon Sep 17 00:00:00 2001
From: Markus Pargmann <mpa@pengutronix.de>
Date: Tue, 6 Oct 2015 20:03:54 +0200
Subject: nbd: Add locking for tasks

The timeout handling introduced in
	7e2893a16d3e (nbd: Fix timeout detection)
introduces a race condition which may lead to killing of tasks that are
not in nbd context anymore. This was not observed or reproducable yet.

This patch adds locking to critical use of task_recv and task_send to
avoid killing tasks that already left the NBD thread functions. This
lock is only acquired if a timeout occures or the nbd device
starts/stops.

Reported-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Markus Pargmann <mpa@pengutronix.de>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Fixes: 7e2893a16d3e ("nbd: Fix timeout detection")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 293495a..1b87623 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -60,6 +60,7 @@ struct nbd_device {
 	bool disconnect; /* a disconnect has been requested by user */
 
 	struct timer_list timeout_timer;
+	spinlock_t tasks_lock;
 	struct task_struct *task_recv;
 	struct task_struct *task_send;
 
@@ -140,21 +141,23 @@ static void sock_shutdown(struct nbd_device *nbd)
 static void nbd_xmit_timeout(unsigned long arg)
 {
 	struct nbd_device *nbd = (struct nbd_device *)arg;
-	struct task_struct *task;
+	unsigned long flags;
 
 	if (list_empty(&nbd->queue_head))
 		return;
 
 	nbd->disconnect = true;
 
-	task = READ_ONCE(nbd->task_recv);
-	if (task)
-		force_sig(SIGKILL, task);
+	spin_lock_irqsave(&nbd->tasks_lock, flags);
+
+	if (nbd->task_recv)
+		force_sig(SIGKILL, nbd->task_recv);
 
-	task = READ_ONCE(nbd->task_send);
-	if (task)
+	if (nbd->task_send)
 		force_sig(SIGKILL, nbd->task_send);
 
+	spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
 	dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
 }
 
@@ -403,17 +406,24 @@ static int nbd_thread_recv(struct nbd_device *nbd)
 {
 	struct request *req;
 	int ret;
+	unsigned long flags;
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	sk_set_memalloc(nbd->sock->sk);
 
+	spin_lock_irqsave(&nbd->tasks_lock, flags);
 	nbd->task_recv = current;
+	spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
 	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 	if (ret) {
 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+
+		spin_lock_irqsave(&nbd->tasks_lock, flags);
 		nbd->task_recv = NULL;
+		spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
 		return ret;
 	}
 
@@ -429,7 +439,9 @@ static int nbd_thread_recv(struct nbd_device *nbd)
 
 	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
 
+	spin_lock_irqsave(&nbd->tasks_lock, flags);
 	nbd->task_recv = NULL;
+	spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
 	if (signal_pending(current)) {
 		siginfo_t info;
@@ -534,8 +546,11 @@ static int nbd_thread_send(void *data)
 {
 	struct nbd_device *nbd = data;
 	struct request *req;
+	unsigned long flags;
 
+	spin_lock_irqsave(&nbd->tasks_lock, flags);
 	nbd->task_send = current;
+	spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
 	set_user_nice(current, MIN_NICE);
 	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
@@ -572,7 +587,15 @@ static int nbd_thread_send(void *data)
 		nbd_handle_req(nbd, req);
 	}
 
+	spin_lock_irqsave(&nbd->tasks_lock, flags);
 	nbd->task_send = NULL;
+	spin_unlock_irqrestore(&nbd->tasks_lock, flags);
+
+	/* Clear maybe pending signals */
+	if (signal_pending(current)) {
+		siginfo_t info;
+		dequeue_signal_lock(current, &current->blocked, &info);
+	}
 
 	return 0;
 }
@@ -1052,6 +1075,7 @@ static int __init nbd_init(void)
 		nbd_dev[i].magic = NBD_MAGIC;
 		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
 		spin_lock_init(&nbd_dev[i].queue_lock);
+		spin_lock_init(&nbd_dev[i].tasks_lock);
 		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
 		mutex_init(&nbd_dev[i].tx_lock);
 		init_timer(&nbd_dev[i].timeout_timer);
-- 
cgit v1.1


From 9ad18ab938375502c03cf467abecbb77264c9475 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Sep 2015 12:47:50 -0400
Subject: writeback: laptop_mode_timer_fn() needs rcu_read_lock() around
 bdi_writeback iteration

laptop_mode_timer_fn() was using bdi_for_each_wb() without the
required RCU locking leading to the following warning.

 WARNING: CPU: 0 PID: 0 at include/linux/backing-dev.h:415 laptop_mode_timer_fn+0x106/0x170()
 ...
 Call Trace:
  <IRQ>  [<ffffffff81480cdc>] dump_stack+0x4e/0x82
  [<ffffffff81051912>] warn_slowpath_common+0x82/0xc0
  [<ffffffff81051a0a>] warn_slowpath_null+0x1a/0x20
  [<ffffffff8115f0e6>] laptop_mode_timer_fn+0x106/0x170
  [<ffffffff810ca8e3>] call_timer_fn+0xb3/0x2f0
  [<ffffffff810cad25>] run_timer_softirq+0x205/0x370
  [<ffffffff81056854>] __do_softirq+0xd4/0x460
  [<ffffffff81056d69>] irq_exit+0x89/0xa0
  [<ffffffff8185a892>] smp_apic_timer_interrupt+0x42/0x50
  [<ffffffff81858a44>] apic_timer_interrupt+0x84/0x90
 ...

Fix it by adding rcu_read_lock() around the iteration.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: a06fd6b10228 ("writeback: make laptop_mode_timer_fn() handle multiple bdi_writeback's")
Reviewed-by: Jan Kara <jack@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 mm/page-writeback.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0a931cd..902e5f2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1965,10 +1965,12 @@ void laptop_mode_timer_fn(unsigned long data)
 	if (!bdi_has_dirty_io(&q->backing_dev_info))
 		return;
 
+	rcu_read_lock();
 	bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
 		if (wb_has_dirty_io(wb))
 			wb_start_writeback(wb, nr_pages, true,
 					   WB_REASON_LAPTOP_TIMER);
+	rcu_read_unlock();
 }
 
 /*
-- 
cgit v1.1


From 6fdf860f15d4a6be8f0947bad608d687fe0c7af7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Sep 2015 12:47:51 -0400
Subject: writeback: fix bdi_writeback iteration in
 wakeup_dirtytime_writeback()

wakeup_dirtytime_writeback() walks and wakes up all wb's of all bdi's;
unfortunately, it was always waking up bdi->wb instead of the wb being
walked.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 001fe6f617b1 ("writeback: make wakeup_dirtytime_writeback() handle multiple bdi_writeback's")
Reviewed-by: Jan Kara <jack@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 091a364..d0da306 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1897,8 +1897,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 		struct wb_iter iter;
 
 		bdi_for_each_wb(wb, bdi, &iter, 0)
-			if (!list_empty(&bdi->wb.b_dirty_time))
-				wb_wakeup(&bdi->wb);
+			if (!list_empty(&wb->b_dirty_time))
+				wb_wakeup(wb);
 	}
 	rcu_read_unlock();
 	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
-- 
cgit v1.1


From b817525a4a80c04e4ca44192d97a1ffa9f2be572 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 2 Oct 2015 14:47:05 -0400
Subject: writeback: bdi_writeback iteration must not skip dying ones

bdi_for_each_wb() is used in several places to wake up or issue
writeback work items to all wb's (bdi_writeback's) on a given bdi.
The iteration is performed by walking bdi->cgwb_tree; however, the
tree only indexes wb's which are currently active.

For example, when a memcg gets associated with a different blkcg, the
old wb is removed from the tree so that the new one can be indexed.
The old wb starts dying from then on but will linger till all its
inodes are drained.  As these dying wb's may still host dirty inodes,
writeback operations which affect all wb's must include them.
bdi_for_each_wb() skipping dying wb's led to sync(2) missing and
failing to sync the inodes belonging to those wb's.

This patch adds a RCU protected @bdi->wb_list which lists all wb's
beloinging to that bdi.  wb's are added on creation and removed on
release rather than on the start of destruction.  bdi_for_each_wb()
usages are replaced with list_for_each[_continue]_rcu() iterations
over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed.

v2: Updated as per Jan.  last_wb ref leak in bdi_split_work_to_wbs()
    fixed and unnecessary list head severing in cgwb_bdi_destroy()
    removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Fixes: ebe41ab0c79d ("writeback: implement bdi_for_each_wb()")
Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                | 31 ++++++++++++++------
 include/linux/backing-dev-defs.h |  3 ++
 include/linux/backing-dev.h      | 63 ----------------------------------------
 mm/backing-dev.c                 | 14 ++++++++-
 mm/page-writeback.c              |  3 +-
 5 files changed, 39 insertions(+), 75 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0da306..29e4599 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 				  struct wb_writeback_work *base_work,
 				  bool skip_if_busy)
 {
-	int next_memcg_id = 0;
-	struct bdi_writeback *wb;
-	struct wb_iter iter;
+	struct bdi_writeback *last_wb = NULL;
+	struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
+						struct bdi_writeback, bdi_node);
 
 	might_sleep();
 restart:
 	rcu_read_lock();
-	bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
 		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
 		struct wb_writeback_work fallback_work;
 		struct wb_writeback_work *work;
 		long nr_pages;
 
+		if (last_wb) {
+			wb_put(last_wb);
+			last_wb = NULL;
+		}
+
 		/* SYNC_ALL writes out I_DIRTY_TIME too */
 		if (!wb_has_dirty_io(wb) &&
 		    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@ restart:
 
 		wb_queue_work(wb, work);
 
-		next_memcg_id = wb->memcg_css->id + 1;
+		/*
+		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
+		 * continuing iteration from @wb after dropping and
+		 * regrabbing rcu read lock.
+		 */
+		wb_get(wb);
+		last_wb = wb;
+
 		rcu_read_unlock();
 		wb_wait_for_completion(bdi, &fallback_work_done);
 		goto restart;
 	}
 	rcu_read_unlock();
+
+	if (last_wb)
+		wb_put(last_wb);
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
@@ -1857,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		struct bdi_writeback *wb;
-		struct wb_iter iter;
 
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_for_each_wb(wb, bdi, &iter, 0)
+		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 			wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
 					   false, reason);
 	}
@@ -1894,9 +1908,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		struct bdi_writeback *wb;
-		struct wb_iter iter;
 
-		bdi_for_each_wb(wb, bdi, &iter, 0)
+		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 			if (!list_empty(&wb->b_dirty_time))
 				wb_wakeup(wb);
 	}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a23209b..1b4d69f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@ struct bdi_writeback {
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
 
+	struct list_head bdi_node;	/* anchored at bdi->wb_list */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct percpu_ref refcnt;	/* used only for !root wb's */
 	struct fprop_local_percpu memcg_completions;
@@ -150,6 +152,7 @@ struct backing_dev_info {
 	atomic_long_t tot_write_bandwidth;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
+	struct list_head wb_list; /* list of all wbs */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d5eb4ad1..78677e5 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -408,61 +408,6 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
 	rcu_read_unlock();
 }
 
-struct wb_iter {
-	int			start_memcg_id;
-	struct radix_tree_iter	tree_iter;
-	void			**slot;
-};
-
-static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
-						   struct backing_dev_info *bdi)
-{
-	struct radix_tree_iter *titer = &iter->tree_iter;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (iter->start_memcg_id >= 0) {
-		iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
-		iter->start_memcg_id = -1;
-	} else {
-		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
-	}
-
-	if (!iter->slot)
-		iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
-	if (iter->slot)
-		return *iter->slot;
-	return NULL;
-}
-
-static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
-						   struct backing_dev_info *bdi,
-						   int start_memcg_id)
-{
-	iter->start_memcg_id = start_memcg_id;
-
-	if (start_memcg_id)
-		return __wb_iter_next(iter, bdi);
-	else
-		return &bdi->wb;
-}
-
-/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
- * @wb_cur: cursor struct bdi_writeback pointer
- * @bdi: bdi to walk wb's of
- * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_memcg_id: memcg ID to start iteration from
- *
- * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
- * to be used as temp storage during iteration.  rcu_read_lock() must be
- * held throughout iteration.
- */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)		\
-	for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);	\
-	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
-
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -522,14 +467,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
-struct wb_iter {
-	int		next_id;
-};
-
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
-	for ((iter)->next_id = (start_blkcg_id);			\
-	     ({	(wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
-
 static inline int inode_congested(struct inode *inode, int cong_bits)
 {
 	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2df8ddc..e92d779 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -480,6 +480,10 @@ static void cgwb_release_workfn(struct work_struct *work)
 						release_work);
 	struct backing_dev_info *bdi = wb->bdi;
 
+	spin_lock_irq(&cgwb_lock);
+	list_del_rcu(&wb->bdi_node);
+	spin_unlock_irq(&cgwb_lock);
+
 	wb_shutdown(wb);
 
 	css_put(wb->memcg_css);
@@ -575,6 +579,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
 		if (!ret) {
 			atomic_inc(&bdi->usage_cnt);
+			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
 			list_add(&wb->memcg_node, memcg_cgwb_list);
 			list_add(&wb->blkcg_node, blkcg_cgwb_list);
 			css_get(memcg_css);
@@ -764,15 +769,22 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
 
 int bdi_init(struct backing_dev_info *bdi)
 {
+	int ret;
+
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
 	INIT_LIST_HEAD(&bdi->bdi_list);
+	INIT_LIST_HEAD(&bdi->wb_list);
 	init_waitqueue_head(&bdi->wb_waitq);
 
-	return cgwb_bdi_init(bdi);
+	ret = cgwb_bdi_init(bdi);
+
+	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+
+	return ret;
 }
 EXPORT_SYMBOL(bdi_init);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 902e5f2..0f1a94e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1956,7 +1956,6 @@ void laptop_mode_timer_fn(unsigned long data)
 	int nr_pages = global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS);
 	struct bdi_writeback *wb;
-	struct wb_iter iter;
 
 	/*
 	 * We want to write everything out, not just down to the dirty
@@ -1966,7 +1965,7 @@ void laptop_mode_timer_fn(unsigned long data)
 		return;
 
 	rcu_read_lock();
-	bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+	list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
 		if (wb_has_dirty_io(wb))
 			wb_start_writeback(wb, nr_pages, true,
 					   WB_REASON_LAPTOP_TIMER);
-- 
cgit v1.1


From d60d1bddd5b642711a237511845853755b25bf1f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Sep 2015 12:47:53 -0400
Subject: writeback: memcg dirty_throttle_control should be initialized with
 wb->memcg_completions

MDTC_INIT() is used to initialize dirty_throttle_control for memcg
domains.  It used DTC_INIT_COMMON() to initialized mdtc->wb and
->wb_completions which is incorrect as DTC_INIT_COMMON() sets the
latter to wb->completions instead of wb->memcg_completions.  This can
lead to wildly incorrect results when calculating the proportion of
dirty memory the memcg domain should get.

Remove DTC_INIT_COMMON() and update MDTC_INIT() to initialize
mdtc->wb_completions to wb->memcg_completions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 mm/page-writeback.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0f1a94e..56c0bff 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -145,9 +145,6 @@ struct dirty_throttle_control {
 	unsigned long		pos_ratio;
 };
 
-#define DTC_INIT_COMMON(__wb)	.wb = (__wb),				\
-				.wb_completions = &(__wb)->completions
-
 /*
  * Length of period for aging writeout fractions of bdis. This is an
  * arbitrarily chosen number. The longer the period, the slower fractions will
@@ -157,12 +154,16 @@ struct dirty_throttle_control {
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
-#define GDTC_INIT(__wb)		.dom = &global_wb_domain,		\
-				DTC_INIT_COMMON(__wb)
+#define GDTC_INIT(__wb)		.wb = (__wb),				\
+				.dom = &global_wb_domain,		\
+				.wb_completions = &(__wb)->completions
+
 #define GDTC_INIT_NO_WB		.dom = &global_wb_domain
-#define MDTC_INIT(__wb, __gdtc)	.dom = mem_cgroup_wb_domain(__wb),	\
-				.gdtc = __gdtc,				\
-				DTC_INIT_COMMON(__wb)
+
+#define MDTC_INIT(__wb, __gdtc)	.wb = (__wb),				\
+				.dom = mem_cgroup_wb_domain(__wb),	\
+				.wb_completions = &(__wb)->memcg_completions, \
+				.gdtc = __gdtc
 
 static bool mdtc_valid(struct dirty_throttle_control *dtc)
 {
@@ -213,7 +214,8 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
-#define GDTC_INIT(__wb)		DTC_INIT_COMMON(__wb)
+#define GDTC_INIT(__wb)		.wb = (__wb),                           \
+				.wb_completions = &(__wb)->completions
 #define GDTC_INIT_NO_WB
 #define MDTC_INIT(__wb, __gdtc)
 
-- 
cgit v1.1


From c5edf9cdc4c483b9a94c03fc0b9f769bd090bf3e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 29 Sep 2015 13:04:26 -0400
Subject: writeback: fix incorrect calculation of available memory for memcg
 domains

For memcg domains, the amount of available memory was calculated as

 min(the amount currently in use + headroom according to memcg,
     total clean memory)

This isn't quite correct as what should be capped by the amount of
clean memory is the headroom, not the sum of memory in use and
headroom.  For example, if a memcg domain has a significant amount of
dirty memory, the above can lead to a value which is lower than the
current amount in use which doesn't make much sense.  In most
circumstances, the above leads to a number which is somewhat but not
drastically lower.

As the amount of memory which can be readily allocated to the memcg
domain is capped by the amount of system-wide clean memory which is
not already assigned to the memcg itself, the number we want is

 the amount currently in use +
 min(headroom according to memcg, clean memory elsewhere in the system)

This patch updates mem_cgroup_wb_stats() to return the number of
filepages and headroom instead of the calculated available pages.
mdtc_cap_avail() is renamed to mdtc_calc_avail() and performs the
above calculation from file, headroom, dirty and globally clean pages.

v2: Dummy mem_cgroup_wb_stats() implementation wasn't updated leading
    to build failure when !CGROUP_WRITEBACK.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: c2aa723a6093 ("writeback: implement memcg writeback domain based throttling")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/memcontrol.h |  8 +++++---
 mm/memcontrol.c            | 35 +++++++++++++++++------------------
 mm/page-writeback.c        | 29 ++++++++++++++++++-----------
 3 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6452ff4..3e3318dd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -676,8 +676,9 @@ enum {
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-			 unsigned long *pdirty, unsigned long *pwriteback);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+			 unsigned long *pheadroom, unsigned long *pdirty,
+			 unsigned long *pwriteback);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
@@ -687,7 +688,8 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 }
 
 static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
-				       unsigned long *pavail,
+				       unsigned long *pfilepages,
+				       unsigned long *pheadroom,
 				       unsigned long *pdirty,
 				       unsigned long *pwriteback)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1fedbde..882c10c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3740,44 +3740,43 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
- * @pavail: out parameter for number of available pages
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
  * @pdirty: out parameter for number of dirty pages
  * @pwriteback: out parameter for number of pages under writeback
  *
- * Determine the numbers of available, dirty, and writeback pages in @wb's
- * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
- * more involved.
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
+ * is a bit more involved.
  *
- * A memcg's headroom is "min(max, high) - used".  The available memory is
- * calculated as the lowest headroom of itself and the ancestors plus the
- * number of pages already being used for file pages.  Note that this
- * doesn't consider the actual amount of available memory in the system.
- * The caller should further cap *@pavail accordingly.
+ * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors.  Note that this doesn't consider the actual amount of
+ * available memory in the system.  The caller should further cap
+ * *@pheadroom accordingly.
  */
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
-			 unsigned long *pdirty, unsigned long *pwriteback)
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+			 unsigned long *pheadroom, unsigned long *pdirty,
+			 unsigned long *pwriteback)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
-	unsigned long head_room = PAGE_COUNTER_MAX;
-	unsigned long file_pages;
 
 	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 
 	/* this should eventually include NR_UNSTABLE_NFS */
 	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+						     (1 << LRU_ACTIVE_FILE));
+	*pheadroom = PAGE_COUNTER_MAX;
 
-	file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
-						    (1 << LRU_ACTIVE_FILE));
 	while ((parent = parent_mem_cgroup(memcg))) {
 		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
 		unsigned long used = page_counter_read(&memcg->memory);
 
-		head_room = min(head_room, ceiling - min(ceiling, used));
+		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
 		memcg = parent;
 	}
-
-	*pavail = file_pages + head_room;
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 56c0bff..2c90357 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -684,13 +684,19 @@ static unsigned long hard_dirty_limit(struct wb_domain *dom,
 	return max(thresh, dom->dirty_limit);
 }
 
-/* memory available to a memcg domain is capped by system-wide clean memory */
-static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+/*
+ * Memory which can be further allocated to a memcg domain is capped by
+ * system-wide clean memory excluding the amount being used in the domain.
+ */
+static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
+			    unsigned long filepages, unsigned long headroom)
 {
 	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
-	unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+	unsigned long clean = filepages - min(filepages, mdtc->dirty);
+	unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+	unsigned long other_clean = global_clean - min(global_clean, clean);
 
-	mdtc->avail = min(mdtc->avail, clean);
+	mdtc->avail = filepages + min(headroom, other_clean);
 }
 
 /**
@@ -1564,16 +1570,16 @@ static void balance_dirty_pages(struct address_space *mapping,
 		}
 
 		if (mdtc) {
-			unsigned long writeback;
+			unsigned long filepages, headroom, writeback;
 
 			/*
 			 * If @wb belongs to !root memcg, repeat the same
 			 * basic calculations for the memcg domain.
 			 */
-			mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
-					    &writeback);
-			mdtc_cap_avail(mdtc);
+			mem_cgroup_wb_stats(wb, &filepages, &headroom,
+					    &mdtc->dirty, &writeback);
 			mdtc->dirty += writeback;
+			mdtc_calc_avail(mdtc, filepages, headroom);
 
 			domain_dirty_limits(mdtc);
 
@@ -1895,10 +1901,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 		return true;
 
 	if (mdtc) {
-		unsigned long writeback;
+		unsigned long filepages, headroom, writeback;
 
-		mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
-		mdtc_cap_avail(mdtc);
+		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+				    &writeback);
+		mdtc_calc_avail(mdtc, filepages, headroom);
 		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
 
 		if (mdtc->dirty > mdtc->bg_thresh)
-- 
cgit v1.1


From 835da3f99d329b1160a1f7fc82c7ac81163d63d0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 6 Oct 2015 22:29:48 +0200
Subject: nvme: fix 32-bit build warning

Compiling the nvme driver on 32-bit warns about a cast from a __u64
variable to a pointer:

drivers/block/nvme-core.c: In function 'nvme_submit_io':
drivers/block/nvme-core.c:1847:4: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    (void __user *)io.addr, length, NULL, 0);

The cast here is intentional and safe, so we can shut up the
gcc warning by adding an intermediate cast to 'uintptr_t'.

I had previously submitted a patch to fix this problem in the
nvme driver, but it was accepted on the same day that two new
warnings got added.

For clarification, I also change the third instance of this cast
to use uintptr_t instead of unsigned long now.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: d29ec8241c10e ("nvme: submit internal commands through the block layer")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 6f04771..fce353b 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1804,7 +1804,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	length = (io.nblocks + 1) << ns->lba_shift;
 	meta_len = (io.nblocks + 1) * ns->ms;
-	metadata = (void __user *)(unsigned long)io.metadata;
+	metadata = (void __user *)(uintptr_t)io.metadata;
 	write = io.opcode & 1;
 
 	if (ns->ext) {
@@ -1844,7 +1844,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	c.rw.metadata = cpu_to_le64(meta_dma);
 
 	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
-			(void __user *)io.addr, length, NULL, 0);
+			(void __user *)(uintptr_t)io.addr, length, NULL, 0);
  unmap:
 	if (meta) {
 		if (status == NVME_SC_SUCCESS && !write) {
@@ -1886,7 +1886,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
 	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
-			NULL, (void __user *)cmd.addr, cmd.data_len,
+			NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
 			&cmd.result, timeout);
 	if (status >= 0) {
 		if (put_user(cmd.result, &ucmd->result))
-- 
cgit v1.1


From f42d79ab67322e51b92dd7aa965e310c71352a64 Mon Sep 17 00:00:00 2001
From: Junichi Nomura <j-nomura@ce.jp.nec.com>
Date: Wed, 14 Oct 2015 05:02:15 +0000
Subject: blk-mq: fix use-after-free in blk_mq_free_tag_set()

tags is freed in blk_mq_free_rq_map() and should not be used after that.
The problem doesn't manifest if CONFIG_CPUMASK_OFFSTACK is false because
free_cpumask_var() is nop.

tags->cpumask is allocated in blk_mq_init_tags() so it's natural to
free cpumask in its counter part, blk_mq_free_tags().

Fixes: f26cdc8536ad ("blk-mq: Shared tag enhancements")
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Keith Busch <keith.busch@intel.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 1 +
 block/blk-mq.c     | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ed96474..ec2d119 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -641,6 +641,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
 {
 	bt_free(&tags->bitmap_tags);
 	bt_free(&tags->breserved_tags);
+	free_cpumask_var(tags->cpumask);
 	kfree(tags);
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7785ae96..85f0143 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2296,10 +2296,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i]) {
+		if (set->tags[i])
 			blk_mq_free_rq_map(set, set->tags[i], i);
-			free_cpumask_var(set->tags[i]->cpumask);
-		}
 	}
 
 	kfree(set->tags);
-- 
cgit v1.1


From 81c04b943872e0332872df18cec1dec89b178b4d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 12 Oct 2015 21:23:39 +0200
Subject: nvme: use an integer value to Linux errno values

Use a separate integer variable to hold the signed Linux errno
values we pass back to the block layer.  Note that for pass through
commands those might still be NVMe values, but those fit into the
int as well.

Fixes: f4829a9b7a61: ("blk-mq: fix racy updates of rq->errors")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index fce353b..84e4a80 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -603,8 +603,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	struct nvme_iod *iod = ctx;
 	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
+	int error = 0;
 
 	if (unlikely(status)) {
 		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
@@ -621,9 +621,11 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 
 		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
-				status = -EINTR;
+				error = -EINTR;
+			else
+				error = status;
 		} else {
-			status = nvme_error_status(status);
+			error = nvme_error_status(status);
 		}
 	}
 
@@ -635,7 +637,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	if (cmd_rq->aborted)
 		dev_warn(nvmeq->dev->dev,
 			"completing aborted command with status:%04x\n",
-			status);
+			error);
 
 	if (iod->nents) {
 		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
@@ -649,7 +651,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	}
 	nvme_free_iod(nvmeq->dev, iod);
 
-	blk_mq_complete_request(req, status);
+	blk_mq_complete_request(req, error);
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-- 
cgit v1.1


From b02176f30cd30acccd3b633ab7d9aed8b5da52ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 8 Sep 2015 12:20:22 -0400
Subject: block: don't release bdi while request_queue has live references

bdi's are initialized in two steps, bdi_init() and bdi_register(), but
destroyed in a single step by bdi_destroy() which, for a bdi embedded
in a request_queue, is called during blk_cleanup_queue() which makes
the queue invisible and starts the draining of remaining usages.

A request_queue's user can access the congestion state of the embedded
bdi as long as it holds a reference to the queue.  As such, it may
access the congested state of a queue which finished
blk_cleanup_queue() but hasn't reached blk_release_queue() yet.
Because the congested state was embedded in backing_dev_info which in
turn is embedded in request_queue, accessing the congested state after
bdi_destroy() was called was fine.  The bdi was destroyed but the
memory region for the congested state remained accessible till the
queue got released.

a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in
bdi_writeback") changed the situation.  Now, the root congested state
which is expected to be pinned while request_queue remains accessible
is separately reference counted and the base ref is put during
bdi_destroy().  This means that the root congested state may go away
prematurely while the queue is between bdi_dstroy() and
blk_cleanup_queue(), which was detected by Andrey's KASAN tests.

The root cause of this problem is that bdi doesn't distinguish the two
steps of destruction, unregistration and release, and now the root
congested state actually requires a separate release step.  To fix the
issue, this patch separates out bdi_unregister() and bdi_exit() from
bdi_destroy().  bdi_unregister() is called from blk_cleanup_queue()
and bdi_exit() from blk_release_queue().  bdi_destroy() is now just a
simple wrapper calling the two steps back-to-back.

While at it, the prototype of bdi_destroy() is moved right below
bdi_setup_and_register() so that the counterpart operations are
located together.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in bdi_writeback")
Cc: stable@vger.kernel.org # v4.2+
Reported-and-tested-by: Andrey Konovalov <andreyknvl@google.com>
Link: http://lkml.kernel.org/g/CAAeHK+zUJ74Zn17=rOyxacHU18SgCfC6bsYW=6kCY5GXJBwGfQ@mail.gmail.com
Reviewed-by: Jan Kara <jack@suse.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c            |  2 +-
 block/blk-sysfs.c           |  1 +
 include/linux/backing-dev.h |  6 +++++-
 mm/backing-dev.c            | 12 +++++++++++-
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2eb722d..18e92a6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -576,7 +576,7 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	bdi_destroy(&q->backing_dev_info);
+	bdi_unregister(&q->backing_dev_info);
 
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3e44a9d..07b42f5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -540,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj)
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
+	bdi_exit(&q->backing_dev_info);
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 78677e5..c85f749 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -19,13 +19,17 @@
 #include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_destroy(struct backing_dev_info *bdi);
+void bdi_exit(struct backing_dev_info *bdi);
 
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+void bdi_unregister(struct backing_dev_info *bdi);
+
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
+void bdi_destroy(struct backing_dev_info *bdi);
+
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 			bool range_cyclic, enum wb_reason reason);
 void wb_start_background_writeback(struct bdi_writeback *wb);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e92d779..9e84139 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -835,7 +835,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 	synchronize_rcu_expedited();
 }
 
-void bdi_destroy(struct backing_dev_info *bdi)
+void bdi_unregister(struct backing_dev_info *bdi)
 {
 	/* make sure nobody finds us on the bdi_list anymore */
 	bdi_remove_from_list(bdi);
@@ -847,9 +847,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
 	}
+}
 
+void bdi_exit(struct backing_dev_info *bdi)
+{
+	WARN_ON_ONCE(bdi->dev);
 	wb_exit(&bdi->wb);
 }
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	bdi_unregister(bdi);
+	bdi_exit(bdi);
+}
 EXPORT_SYMBOL(bdi_destroy);
 
 /*
-- 
cgit v1.1


From 0dfc70c33409afc232ef0b9ec210535dfbf9bc61 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 15 Oct 2015 13:38:48 -0600
Subject: NVMe: Fix memory leak on retried commands

Resources are reallocated for requeued commands, so unmap and release
the iod for the failed command.

It's a pretty bad memory leak and causes a kernel hang if you remove a
drive because of a busy dma pool. You'll get messages spewing like this:

  nvme 0000:xx:xx.x: dma_pool_destroy prp list 256, ffff880420dec000 busy

and lock up pci and the driver since removal never completes while
holding a lock.

Cc: stable@vger.kernel.org
Cc: <stable@vger.kernel.org> # 4.0.x-
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 84e4a80..ccc0c1f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -604,6 +604,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
+	bool requeue = false;
 	int error = 0;
 
 	if (unlikely(status)) {
@@ -611,12 +612,13 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 		    && (jiffies - req->start_time) < req->timeout) {
 			unsigned long flags;
 
+			requeue = true;
 			blk_mq_requeue_request(req);
 			spin_lock_irqsave(req->q->queue_lock, flags);
 			if (!blk_queue_stopped(req->q))
 				blk_mq_kick_requeue_list(req->q);
 			spin_unlock_irqrestore(req->q->queue_lock, flags);
-			return;
+			goto release_iod;
 		}
 
 		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
@@ -639,6 +641,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			"completing aborted command with status:%04x\n",
 			error);
 
+release_iod:
 	if (iod->nents) {
 		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
 			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
@@ -651,7 +654,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 	}
 	nvme_free_iod(nvmeq->dev, iod);
 
-	blk_mq_complete_request(req, error);
+	if (likely(!requeue))
+		blk_mq_complete_request(req, error);
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-- 
cgit v1.1


From e27c5b9d23168cc2cb8fec147ae7ed1f7a2005c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Oct 2015 18:14:19 -0400
Subject: writeback: remove broken rbtree_postorder_for_each_entry_safe() usage
 in cgwb_bdi_destroy()

a20135ffbc44 ("writeback: don't drain bdi_writeback_congested on bdi
destruction") added rbtree_postorder_for_each_entry_safe() which is
used to remove all entries; however, according to Cody, the iterator
isn't safe against operations which may rebalance the tree.  Fix it by
switching to repeatedly removing rb_first() until empty.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Cody P Schafer <dev@codyps.com>
Fixes: a20135ffbc44 ("writeback: don't drain bdi_writeback_congested on bdi destruction")
Link: http://lkml.kernel.org/g/1443997973-1700-1-git-send-email-dev@codyps.com
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 mm/backing-dev.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9e84139..619984f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -681,7 +681,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
 {
 	struct radix_tree_iter iter;
-	struct bdi_writeback_congested *congested, *congested_n;
+	struct rb_node *rbn;
 	void **slot;
 
 	WARN_ON(test_bit(WB_registered, &bdi->wb.state));
@@ -691,9 +691,11 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
 	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
 		cgwb_kill(*slot);
 
-	rbtree_postorder_for_each_entry_safe(congested, congested_n,
-					&bdi->cgwb_congested_tree, rb_node) {
-		rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
+	while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
+		struct bdi_writeback_congested *congested =
+			rb_entry(rbn, struct bdi_writeback_congested, rb_node);
+
+		rb_erase(rbn, &bdi->cgwb_congested_tree);
 		congested->bdi = NULL;	/* mark @congested unlinked */
 	}
 
-- 
cgit v1.1