From 7f0e7bed936a0c422641a046551829a01341dd80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jun 2010 18:14:34 +0200 Subject: writeback: fix writeback completion notifications The code dealing with bdi_work->state and completion of a bdi_work is a major mess currently. This patch makes sure we directly use one set of flags to deal with it, and use it consistently, which means: - always notify about completion from the rcu callback. We only ever wait for it from on-stack callers, so this simplification does not even cause a theoretical slowdown currently. It also makes sure we don't miss out on the notification if we ever add other callers to wait for it. - make the earlier completion notification depend on the on-stack allocation, not the sync mode. If we introduce new callers that want to do WB_SYNC_NONE writeback from on-stack callers, this will be necessary. Also rename bdi_wait_on_work_clear to bdi_wait_on_work_done and inline a few small functions into their only caller to make the code understandable. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 65 +++++++++++++------------------------------------------ 1 file changed, 15 insertions(+), 50 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1d1088f..dbf6f10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -63,24 +63,16 @@ struct bdi_work { }; enum { - WS_USED_B = 0, - WS_ONSTACK_B, + WS_INPROGRESS = 0, + WS_ONSTACK, }; -#define WS_USED (1 << WS_USED_B) -#define WS_ONSTACK (1 << WS_ONSTACK_B) - -static inline bool bdi_work_on_stack(struct bdi_work *work) -{ - return test_bit(WS_ONSTACK_B, &work->state); -} - static inline void bdi_work_init(struct bdi_work *work, struct wb_writeback_args *args) { INIT_RCU_HEAD(&work->rcu_head); work->args = *args; - work->state = WS_USED; + __set_bit(WS_INPROGRESS, &work->state); } /** @@ -95,43 +87,16 @@ int writeback_in_progress(struct backing_dev_info *bdi) return !list_empty(&bdi->work_list); } -static void bdi_work_clear(struct bdi_work *work) -{ - clear_bit(WS_USED_B, &work->state); - smp_mb__after_clear_bit(); - /* - * work can have disappeared at this point. bit waitq functions - * should be able to tolerate this, provided bdi_sched_wait does - * not dereference it's pointer argument. - */ - wake_up_bit(&work->state, WS_USED_B); -} - static void bdi_work_free(struct rcu_head *head) { struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - if (!bdi_work_on_stack(work)) - kfree(work); - else - bdi_work_clear(work); -} - -static void wb_work_complete(struct bdi_work *work) -{ - const enum writeback_sync_modes sync_mode = work->args.sync_mode; - int onstack = bdi_work_on_stack(work); + clear_bit(WS_INPROGRESS, &work->state); + smp_mb__after_clear_bit(); + wake_up_bit(&work->state, WS_INPROGRESS); - /* - * For allocated work, we can clear the done/seen bit right here. - * For on-stack work, we need to postpone both the clear and free - * to after the RCU grace period, since the stack could be invalidated - * as soon as bdi_work_clear() has done the wakeup.
- */ - if (!onstack) - bdi_work_clear(work); - if (sync_mode == WB_SYNC_NONE || onstack) - call_rcu(&work->rcu_head, bdi_work_free); + if (!test_bit(WS_ONSTACK, &work->state)) + kfree(work); } static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) @@ -147,7 +112,7 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) list_del_rcu(&work->list); spin_unlock(&bdi->wb_lock); - wb_work_complete(work); + call_rcu(&work->rcu_head, bdi_work_free); } } @@ -185,9 +150,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) * Used for on-stack allocated work items. The caller needs to wait until * the wb threads have acked the work before it's safe to continue. */ -static void bdi_wait_on_work_clear(struct bdi_work *work) +static void bdi_wait_on_work_done(struct bdi_work *work) { - wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, + wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait, TASK_UNINTERRUPTIBLE); } @@ -234,10 +199,10 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, struct bdi_work work; bdi_work_init(&work, &args); - work.state |= WS_ONSTACK; + __set_bit(WS_ONSTACK, &work.state); bdi_queue_work(bdi, &work); - bdi_wait_on_work_clear(&work); + bdi_wait_on_work_done(&work); } /** @@ -911,7 +876,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (args.sync_mode == WB_SYNC_NONE) + if (!test_bit(WS_ONSTACK, &work->state)) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -920,7 +885,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (args.sync_mode == WB_SYNC_ALL) + if (test_bit(WS_ONSTACK, &work->state)) wb_clear_pending(wb, work); } -- cgit v1.1 From 3c4d716538f3eefb1c1f10961a047a6456a2b590 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jun 2010 18:14:43 +0200 Subject: writeback: queue work on stack in writeback_inodes_sb If we want to rely on s_umount in the caller we need to wait for completion of the I/O submission before returning to the caller. Refactor bdi_sync_writeback into a bdi_queue_work_onstack helper and use it for this case. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index dbf6f10..7596669 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -178,30 +178,22 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, } /** - * bdi_sync_writeback - start and wait for writeback - * @bdi: the backing device to write from + * bdi_queue_work_onstack - start and wait for writeback * @sb: write inodes from this super_block * * Description: - * This does WB_SYNC_ALL data integrity writeback and waits for the - * IO to complete. Callers must hold the sb s_umount semaphore for + * This function initiates writeback and waits for the operation to + * complete. Callers must hold the sb s_umount semaphore for * reading, to avoid having the super disappear before we are done. 
*/ -static void bdi_sync_writeback(struct backing_dev_info *bdi, - struct super_block *sb) +static void bdi_queue_work_onstack(struct wb_writeback_args *args) { - struct wb_writeback_args args = { - .sb = sb, - .sync_mode = WB_SYNC_ALL, - .nr_pages = LONG_MAX, - .range_cyclic = 0, - }; struct bdi_work work; - bdi_work_init(&work, &args); + bdi_work_init(&work, args); __set_bit(WS_ONSTACK, &work.state); - bdi_queue_work(bdi, &work); + bdi_queue_work(args->sb->s_bdi, &work); bdi_wait_on_work_done(&work); } @@ -944,7 +936,7 @@ int bdi_writeback_task(struct bdi_writeback *wb) /* * Schedule writeback for all backing devices. This does WB_SYNC_NONE - * writeback, for integrity writeback see bdi_sync_writeback(). + * writeback, for integrity writeback see bdi_queue_work_onstack(). */ static void bdi_writeback_all(struct super_block *sb, long nr_pages) { @@ -1183,12 +1175,15 @@ void writeback_inodes_sb(struct super_block *sb) { unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; + struct wb_writeback_args args = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + }; - nr_to_write = nr_dirty + nr_unstable + + args.nr_pages = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - bdi_start_writeback(sb->s_bdi, sb, nr_to_write); + bdi_queue_work_onstack(&args); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1218,7 +1213,14 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); */ void sync_inodes_sb(struct super_block *sb) { - bdi_sync_writeback(sb->s_bdi, sb); + struct wb_writeback_args args = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + }; + + bdi_queue_work_onstack(&args); wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); -- cgit v1.1 From cf37e972478ec58a8a54a6b4f951815f0ae28f78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jun 2010 18:14:51 +0200 Subject: writeback: enforce s_umount locking in writeback_inodes_sb Make sure that not only sync_filesystem but all callers of writeback_inodes_sb have the superblock protected against remount. As-is this disables all functionality for these callers, but the next patch relies on this locking to fix writeback_inodes_sb for sync_filesystem. 
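As a minimal sketch of the caller-side contract this patch enforces (illustrative only; the real hunks are in the diff below):

	/*
	 * Every caller of writeback_inodes_sb() must hold s_umount for
	 * reading, so the superblock cannot be remounted or unmounted
	 * underneath the writeback.
	 */
	down_read(&sb->s_umount);
	writeback_inodes_sb(sb);	/* asserts rwsem_is_locked(&sb->s_umount) */
	up_read(&sb->s_umount);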
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 ++++++ fs/ubifs/budget.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7596669..2627f0d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1180,6 +1180,8 @@ void writeback_inodes_sb(struct super_block *sb) .sync_mode = WB_SYNC_NONE, }; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + args.nr_pages = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); @@ -1197,7 +1199,9 @@ EXPORT_SYMBOL(writeback_inodes_sb); int writeback_inodes_sb_if_idle(struct super_block *sb) { if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); writeback_inodes_sb(sb); + up_read(&sb->s_umount); return 1; } else return 0; @@ -1220,6 +1224,8 @@ void sync_inodes_sb(struct super_block *sb) .range_cyclic = 0, }; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + bdi_queue_work_onstack(&args); wait_sb_inodes(sb); } diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 076ca50..c8ff0d1 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -62,7 +62,9 @@ */ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { + down_read(&c->vfs_sb->s_umount); writeback_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); } /** -- cgit v1.1 From d19de7edf59cdd586777b009e0e8fbe5412dd35f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jun 2010 18:14:58 +0200 Subject: writeback: fix writeback_inodes_wb from writeback_inodes_sb When we call writeback_inodes_wb from writeback_inodes_sb we always have s_umount held, which currently makes the whole operation a no-op. But if we are called to write out inodes for a specific superblock we always have s_umount held, so replace the incorrect logic checking for WB_SYNC_ALL, which only worked by coincidence, with the proper check for an explicit superblock argument. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 61 +++++++++++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2627f0d..68ece4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -518,39 +518,19 @@ select_queue: return ret; } -static void unpin_sb_for_writeback(struct super_block *sb) -{ - up_read(&sb->s_umount); - put_super(sb); -} - -enum sb_pin_state { - SB_PINNED, - SB_NOT_PINNED, - SB_PIN_FAILED }; /* - * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * For background writeback the caller does not have the sb pinned * before calling writeback. So make sure that we do pin it, so it doesn't * go away while we are writing inodes from it.
*/ -static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, - struct super_block *sb) +static bool pin_sb_for_writeback(struct super_block *sb) { - /* - * Caller must already hold the ref for this - */ - if (wbc->sync_mode == WB_SYNC_ALL) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - return SB_NOT_PINNED; - } spin_lock(&sb_lock); sb->s_count++; if (down_read_trylock(&sb->s_umount)) { if (sb->s_root) { spin_unlock(&sb_lock); - return SB_PINNED; + return true; } /* * umounted, drop rwsem again and fall through to failure @@ -559,7 +539,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, } sb->s_count--; spin_unlock(&sb_lock); - return SB_PIN_FAILED; + return false; } /* @@ -638,24 +618,29 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); struct super_block *sb = inode->i_sb; - enum sb_pin_state state; - if (wbc->sb && sb != wbc->sb) { - /* super block given and doesn't - match, skip this inode */ - redirty_tail(inode); - continue; - } - state = pin_sb_for_writeback(wbc, sb); + if (wbc->sb) { + /* + * We are requested to write out inodes for a specific + * superblock. This means we already have s_umount + * taken by the caller which also waits for us to + * complete the writeout. + */ + if (sb != wbc->sb) { + redirty_tail(inode); + continue; + } - if (state == SB_PIN_FAILED) { - requeue_io(inode); - continue; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + ret = writeback_sb_inodes(sb, wb, wbc); + } else { + if (!pin_sb_for_writeback(sb)) + continue; + ret = writeback_sb_inodes(sb, wb, wbc); + drop_super(sb); } - ret = writeback_sb_inodes(sb, wb, wbc); - if (state == SB_PINNED) - unpin_sb_for_writeback(sb); if (ret) break; } -- cgit v1.1 From b8c2f3474f1077599ec6e90c2f263f17055cc3d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jun 2010 18:15:07 +0200 Subject: writeback: simplify wakeup_flusher_threads bdi_writeback_all only has one caller, so fold it to simplify the code and flatten the call stack. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 68ece4b..4fcca4f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -920,42 +920,32 @@ int bdi_writeback_task(struct bdi_writeback *wb) } /* - * Schedule writeback for all backing devices. This does WB_SYNC_NONE - * writeback, for integrity writeback see bdi_queue_work_onstack(). + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. */ -static void bdi_writeback_all(struct super_block *sb, long nr_pages) +void wakeup_flusher_threads(long nr_pages) { + struct backing_dev_info *bdi; struct wb_writeback_args args = { - .sb = sb, - .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, }; - struct backing_dev_info *bdi; - rcu_read_lock(); + if (nr_pages) { + args.nr_pages = nr_pages; + } else { + args.nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + } + rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); } - rcu_read_unlock(); } -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. 
- */ -void wakeup_flusher_threads(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - bdi_writeback_all(NULL, nr_pages); -} - static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { -- cgit v1.1 From c5444198ca210498e8ac0ba121b4cd3537aa12f7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Jun 2010 18:15:15 +0200 Subject: writeback: simplify and split bdi_start_writeback bdi_start_writeback now never gets a superblock passed, so we can just remove that case. And to further untangle the code and flatten the call stack, split it into two trivial helpers for its two callers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 32 ++++++++++++++++++++------------ include/linux/backing-dev.h | 4 ++-- mm/page-writeback.c | 5 ++--- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4fcca4f..0079bf5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -200,7 +200,6 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args) /** * bdi_start_writeback - start writeback * @bdi: the backing device to write from - * @sb: write inodes from this super_block * @nr_pages: the number of pages to write * * Description: @@ -209,25 +208,34 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args) * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { struct wb_writeback_args args = { - .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, }; - /* - * We treat @nr_pages=0 as the special case to do background writeback, - * ie. to sync pages until the background dirty threshold is reached. - */ - if (!nr_pages) { - args.nr_pages = LONG_MAX; - args.for_background = 1; - } + bdi_alloc_queue_work(bdi, &args); +} +/** + * bdi_start_background_writeback - start background writeback + * @bdi: the backing device to write from + * + * Description: + * This does WB_SYNC_NONE background writeback. The IO is only + * started when this function returns, we make no guarentees on + * completion. Caller need not hold sb s_umount semaphore.
+ */ +void bdi_start_background_writeback(struct backing_dev_info *bdi) +{ + struct wb_writeback_args args = { + .sync_mode = WB_SYNC_NONE, + .nr_pages = LONG_MAX, + .for_background = 1, + .range_cyclic = 1, + }; bdi_alloc_queue_work(bdi, &args); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aee5f6c..9ae2889 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,8 +105,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); -void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages); +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); +void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_task(struct bdi_writeback *wb); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bbd396a..54f28bd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping, (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, NULL, 0); + bdi_start_background_writeback(bdi); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -705,9 +705,8 @@ void laptop_mode_timer_fn(unsigned long data) * We want to write everything out, not just down to the dirty * threshold */ - if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages); + bdi_start_writeback(&q->backing_dev_info, nr_pages); } /* -- cgit v1.1 From 334132ae921a14ac2b2ba48e174136f7f2c9aae1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Jun 2010 14:28:43 +0200 Subject: writeback: add missing requeue_io in writeback_inodes_wb In "writeback: fix writeback_inodes_wb from writeback_inodes_sb" I accidentally removed the requeue_io if we need to skip a superblock because we can't pin it. Add it back, otherwise we're getting spurious lockups after multiple xfstests runs. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0079bf5..3a066e9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -643,8 +643,10 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, ret = writeback_sb_inodes(sb, wb, wbc); } else { - if (!pin_sb_for_writeback(sb)) + if (!pin_sb_for_writeback(sb)) { + requeue_io(inode); continue; + } ret = writeback_sb_inodes(sb, wb, wbc); drop_super(sb); } -- cgit v1.1 From 29cb48594b873f6193d6327097e504bd3e2314de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Jun 2010 15:31:01 +0200 Subject: writeback: fix pin_sb_for_writeback We need to check for s_instances to make sure we don't bother working against a filesystem that is beeing unmounted, and we need to call put_super to make sure a superblock is freed when we race against umount. Also no need to keep sb_lock after we got a reference on it. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3a066e9..0609607 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -534,19 +534,21 @@ select_queue: static bool pin_sb_for_writeback(struct super_block *sb) { spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + sb->s_count++; + spin_unlock(&sb_lock); + if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) { - spin_unlock(&sb_lock); + if (sb->s_root) return true; - } - /* - * umounted, drop rwsem again and fall through to failure - */ up_read(&sb->s_umount); } - sb->s_count--; - spin_unlock(&sb_lock); + + put_super(sb); return false; } -- cgit v1.1 From dc66c74de6f4238020db3e2041d4aca5c5b3e9bc Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 2 Jun 2010 14:31:29 +0200 Subject: drbd: Fixed a race between disk-attach and unexpected state changes This was a very hard-to-trigger race condition. If we got a state packet from the peer, after drbd_nl_disk() has already changed the disk state to D_NEGOTIATING but after_state_ch() had not yet been run by the worker, then receive_state() might call drbd_sync_handshake(), which in turn crashed when accessing p_uuid. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 -- drivers/block/drbd/drbd_nl.c | 6 ++++++ include/linux/drbd.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6b077f9..7258c95 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1236,8 +1236,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Last part of the attaching process ... */ if (ns.conn >= C_CONNECTED && os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { - kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ - mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); drbd_send_state(mdev); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 632e324..2151f18 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1114,6 +1114,12 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp mdev->new_state_tmp.i = ns.i; ns.i = os.i; ns.disk = D_NEGOTIATING; + + /* We expect to receive up-to-date UUIDs soon. To avoid a race in receive_state, free p_uuid while holding req_lock. I.e. atomic with the state change */ + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; } rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 30da4ae..b8d2516 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.8rc2" +#define REL_VERSION "8.3.8" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 94 -- cgit v1.1 From d4a3895f5d024b47ef8e9d98c59a9b86dcdcef59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Jun 2010 12:55:09 +0200 Subject: cpqarray: fix wrong __init type on pci probe function It needs to be __devinit, not __init.
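A brief illustration of the section rules behind this fix (the probe functions here are hypothetical; the annotations are the stock ones from <linux/init.h>): __init text is freed once boot completes, while __devinit text is retained when CONFIG_HOTPLUG is set, because a PCI probe routine may run long after boot if a matching device is hot-added.

	/* Wrong: discarded after boot, yet callable on device hot-add */
	static int __init bad_probe(struct pci_dev *pdev,
				    const struct pci_device_id *ent)
	{
		return 0;
	}

	/* Right: kept around for as long as probing remains possible */
	static int __devinit good_probe(struct pci_dev *pdev,
					const struct pci_device_id *ent)
	{
		return 0;
	}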
Signed-off-by: Jens Axboe --- drivers/block/cpqarray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 91d1163..95ae5b9 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -503,7 +503,7 @@ Enomem4: return -1; } -static int __init cpqarray_init_one( struct pci_dev *pdev, +static int __devinit cpqarray_init_one( struct pci_dev *pdev, const struct pci_device_id *ent) { int i; -- cgit v1.1 From 552618d124b68d41c2effaaaa3ca5b8ce9598502 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Jun 2010 15:21:33 +0200 Subject: cpqarray: fix two more wrong section types cpqarray_register_ctlr() and cpqarray_eisa_detect() also need to be marked as __devinit. Signed-off-by: Jens Axboe --- drivers/block/cpqarray.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 95ae5b9..abb4ec6 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -386,7 +386,7 @@ static void __devexit cpqarray_remove_one_eisa (int i) } /* pdev is NULL for eisa */ -static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev) +static int __devinit cpqarray_register_ctlr( int i, struct pci_dev *pdev) { struct request_queue *q; int j; @@ -740,7 +740,7 @@ __setup("smart2=", cpqarray_setup); /* * Find an EISA controller's signature. Set up an hba if we find it. */ -static int __init cpqarray_eisa_detect(void) +static int __devinit cpqarray_eisa_detect(void) { int i=0, j; __u32 board_id; -- cgit v1.1 From 79600aadcf35dd31ec284928cf45296fea98db61 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Tue, 15 Jun 2010 08:12:34 +0200 Subject: cciss: set SCSI max cmd len to 16, as default is wrong Signed-off-by: Stephen M. Cameron Cc: Mike Miller Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/cciss_scsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 3381505..72dae92 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -861,6 +861,7 @@ cciss_scsi_detect(int ctlr) sh->n_io_port = 0; // I don't think we use these two... sh->this_id = SELF_SCSI_ID; sh->sg_tablesize = hba[ctlr]->maxsgentries; + sh->max_cmd_len = MAX_COMMAND_SIZE; ((struct cciss_scsi_adapter_data_t *) hba[ctlr]->scsi_ctlr)->scsi_host = sh; -- cgit v1.1 From fbbf055692aeb25c54c49d9ca84532de836fbba0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jun 2010 09:54:32 +0200 Subject: block: fix DISCARD_BARRIER requests Filesystems assume that DISCARD_BARRIER requests are full barriers, so that they don't have to track in-progress discard operations when submitting new I/O. But currently we only treat them as elevator barriers, which don't actually do the necessary queue drains. Also remove the unlikely around both the DISCARD and BARRIER requests - they happen far too often for a static mispredict.
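On the unlikely() removal, a generic illustration (names made up, not from the patch): the hint only pays off when the branch really is rare, because it pushes the marked path out of the hot instruction layout.

	if (unlikely(status == -EIO))	/* genuine corner case: hint helps */
		handle_io_error();

	if (flags & COMMON_FLAG)	/* frequent branch: no hint, let the
					   branch predictor do its job */
		handle_common_case();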
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f84cce4..3deca77 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1149,13 +1149,10 @@ void init_request_from_bio(struct request *req, struct bio *bio) else req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; - if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { + if (bio_rw_flagged(bio, BIO_RW_DISCARD)) req->cmd_flags |= REQ_DISCARD; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) - req->cmd_flags |= REQ_SOFTBARRIER; - } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_HARDBARRIER; - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; if (bio_rw_flagged(bio, BIO_RW_META)) -- cgit v1.1 From c10b61f0910466b4b99c266a7d76ac4390743fb5 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 17 Jun 2010 10:19:11 -0400 Subject: cfq: Don't allow queue merges for queues that have no process references Hi, A user reported a kernel bug when running a particular program that did the following: created 32 threads - each thread took a mutex, grabbed a global offset, added a buffer size to that offset, released the lock - read from the given offset in the file - created a new thread to do the same - exited The result is that cfq's close cooperator logic would trigger, as the threads were issuing I/O within the mean seek distance of one another. This workload managed to routinely trigger a use after free bug when walking the list of merge candidates for a particular cfqq (cfqq->new_cfqq). The logic used for merging queues looks like this: static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) { int process_refs, new_process_refs; struct cfq_queue *__cfqq; /* Avoid a circular list and skip interim queue merges */ while ((__cfqq = new_cfqq->new_cfqq)) { if (__cfqq == cfqq) return; new_cfqq = __cfqq; } process_refs = cfqq_process_refs(cfqq); /* * If the process for the cfqq has gone away, there is no * sense in merging the queues. */ if (process_refs == 0) return; /* * Merge in the direction of the lesser amount of work. */ new_process_refs = cfqq_process_refs(new_cfqq); if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; atomic_add(process_refs, &new_cfqq->ref); } else { new_cfqq->new_cfqq = cfqq; atomic_add(new_process_refs, &cfqq->ref); } } When a merge candidate is found, we add the process references for the queue with less references to the queue with more. The actual merging of queues happens when a new request is issued for a given cfqq. In the case of the test program, it only does a single pread call to read in 1MB, so the actual merge never happens. Normally, this is fine, as when the queue exits, we simply drop the references we took on the other cfqqs in the merge chain: /* * If this queue was scheduled to merge with another queue, be * sure to drop the reference taken on that queue (and others in * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. */ __cfqq = cfqq->new_cfqq; while (__cfqq) { if (__cfqq == cfqq) { WARN(1, "cfqq->new_cfqq loop detected\n"); break; } next = __cfqq->new_cfqq; cfq_put_queue(__cfqq); __cfqq = next; } However, there is a hole in this logic. 
Consider the following (and keep in mind that each I/O keeps a reference to the cfqq): q1->new_cfqq = q2 // q2 now has 2 process references q3->new_cfqq = q2 // q2 now has 3 process references // the process associated with q2 exits // q2 now has 2 process references // queue 1 exits, drops its reference on q2 // q2 now has 1 process reference // q3 exits, so has 0 process references, and hence drops its references // to q2, which leaves q2 also with 0 process references q4 comes along and wants to merge with q3 q3->new_cfqq still points at q2! We follow that link and end up at an already freed cfqq. So, the fix is to not follow a merge chain if the top-most queue does not have a process reference, otherwise any queue in the chain could be already freed. I also changed the logic to disallow merging with a queue that does not have any process references. Previously, we did this check for one of the merge candidates, but not the other. That doesn't really make sense. Without the attached patch, my system would BUG within a couple of seconds of running the reproducer program. With the patch applied, my system ran the program for over an hour without issues. This addresses the following bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=16217 Thanks a ton to Phil Carns for providing the bug report and an excellent reproducer. [ Note for stable: this applies to 2.6.32/33/34 ]. Signed-off-by: Jeff Moyer Reported-by: Phil Carns Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5ff4f48..153f627 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1986,6 +1986,15 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) int process_refs, new_process_refs; struct cfq_queue *__cfqq; + /* + * If there are no process references on the new_cfqq, then it is + * unsafe to follow the ->new_cfqq chain as other cfqq's in the + * chain may have dropped their last reference (not just their + * last process reference). + */ + if (!cfqq_process_refs(new_cfqq)) + return; + /* Avoid a circular list and skip interim queue merges */ while ((__cfqq = new_cfqq->new_cfqq)) { if (__cfqq == cfqq) @@ -1994,17 +2003,17 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } process_refs = cfqq_process_refs(cfqq); + new_process_refs = cfqq_process_refs(new_cfqq); /* * If the process for the cfqq has gone away, there is no * sense in merging the queues. */ - if (process_refs == 0) + if (process_refs == 0 || new_process_refs == 0) return; /* * Merge in the direction of the lesser amount of work. */ - new_process_refs = cfqq_process_refs(new_cfqq); if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; atomic_add(process_refs, &new_cfqq->ref); -- cgit v1.1 From e98ef89b30b8a2e882b11d4965347015770f3627 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 18 Jun 2010 10:39:47 -0400 Subject: cfq-iosched: Fixed boot warning with BLK_CGROUP=y and CFQ_GROUP_IOSCHED=n Hi Jens, Few days back Ingo noticed a CFQ boot time warning. This patch fixes it. The issue here is that with CFQ_GROUP_IOSCHED=n, CFQ should not really be making blkio stat related calls. > Hm, it's still not entirely fixed, as of 2.6.35-rc2-00131-g7908a9e. 
With > some > configs i get bad spinlock warnings during bootup: > > [ 28.968013] initcall net_olddevs_init+0x0/0x82 returned 0 after 93750 > usecs > [ 28.972003] calling b44_init+0x0/0x55 @ 1 > [ 28.976009] bus: 'pci': add driver b44 > [ 28.976374] sda: > [ 28.978157] BUG: spinlock bad magic on CPU#1, async/0/117 > [ 28.980000] lock: 7e1c5bbc, .magic: 00000000, .owner: /-1, +.owner_cpu: 0 > [ 28.980000] Pid: 117, comm: async/0 Not tainted +2.6.35-rc2-tip-01092-g010e7ef-dirty #8183 > [ 28.980000] Call Trace: > [ 28.980000] [<41ba6d55>] ? printk+0x20/0x24 > [ 28.980000] [<4134b7b7>] spin_bug+0x7c/0x87 > [ 28.980000] [<4134b853>] do_raw_spin_lock+0x1e/0x123 > [ 28.980000] [<41ba92ca>] ? _raw_spin_lock_irqsave+0x12/0x20 > [ 28.980000] [<41ba92d2>] _raw_spin_lock_irqsave+0x1a/0x20 > [ 28.980000] [<4133476f>] blkiocg_update_io_add_stats+0x25/0xfb > [ 28.980000] [<41335dae>] ? cfq_prio_tree_add+0xb1/0xc1 > [ 28.980000] [<41337bc7>] cfq_insert_request+0x8c/0x425 Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 54 ++++++++++++------------ block/cfq.h | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 block/cfq.h diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 153f627..7982b83 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,7 +14,7 @@ #include #include #include -#include "blk-cgroup.h" +#include "cfq.h" /* * tunables @@ -879,7 +879,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); cfqg->saved_workload_slice = 0; - blkiocg_update_dequeue_stats(&cfqg->blkg, 1); + cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) @@ -939,8 +939,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); - blkiocg_set_start_empty_time(&cfqg->blkg); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -995,7 +995,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) /* Add group onto cgroup list */ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); @@ -1079,7 +1079,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd) * it from cgroup list, then it will take care of destroying * cfqg also. 
*/ - if (!blkiocg_del_blkio_group(&cfqg->blkg)) + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) cfq_destroy_cfqg(cfqd, cfqg); } } @@ -1421,10 +1421,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); - blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), rq_is_sync(rq)); } @@ -1482,8 +1482,8 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), - rq_is_sync(rq)); + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); if (rq_is_meta(rq)) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; @@ -1518,8 +1518,8 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio), - cfq_bio_sync(bio)); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, + bio_data_dir(bio), cfq_bio_sync(bio)); } static void @@ -1539,8 +1539,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), - rq_is_sync(next)); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(next), rq_is_sync(next)); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, @@ -1571,7 +1571,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { del_timer(&cfqd->idle_slice_timer); - blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); + cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); } static void __cfq_set_active_queue(struct cfq_data *cfqd, @@ -1580,7 +1580,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -1911,7 +1911,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); + cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); } @@ -1931,7 +1931,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; - blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), + cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), rq_data_dir(rq), rq_is_sync(rq)); } @@ -3257,7 +3257,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); } else { - blkiocg_update_idle_time_stats( + 
cfq_blkiocg_update_idle_time_stats( &cfqq->cfqg->blkg); cfq_mark_cfqq_must_dispatch(cfqq); } @@ -3285,7 +3285,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, &cfqd->serving_group->blkg, rq_data_dir(rq), rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); @@ -3373,9 +3373,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; - blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), rq_data_dir(rq), - rq_is_sync(rq)); + cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, + rq_start_time_ns(rq), rq_io_start_time_ns(rq), + rq_data_dir(rq), rq_is_sync(rq)); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -3739,7 +3739,7 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); cfq_release_cfq_groups(cfqd); - blkiocg_del_blkio_group(&cfqd->root_group.blkg); + cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); @@ -3807,8 +3807,8 @@ static void *cfq_init_queue(struct request_queue *q) */ atomic_set(&cfqg->ref, 1); rcu_read_lock(); - blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, - 0); + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, + (void *)cfqd, 0); rcu_read_unlock(); #endif /* diff --git a/block/cfq.h b/block/cfq.h new file mode 100644 index 0000000..8783318 --- /dev/null +++ b/block/cfq.h @@ -0,0 +1,115 @@ +#ifndef _CFQ_H +#define _CFQ_H +#include "blk-cgroup.h" + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) +{ + blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkiocg_update_dequeue_stats(blkg, dequeue); +} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) +{ + blkiocg_update_timeslice_used(blkg, time); +} + +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + blkiocg_set_start_empty_time(blkg); +} + +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_remove_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_merged_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_idle_time_stats(blkg); +} + +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + blkiocg_update_avg_queue_size_stats(blkg); +} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_set_idle_time_stats(blkg); +} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); +} + +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t 
io_start_time, bool direction, bool sync) +{ + cfq_blkiocg_update_completion_stats(blkg, start_time, io_start_time, + direction, sync); +} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) { + blkiocg_add_blkio_group(blkcg, blkg, key, dev); +} + +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return blkiocg_del_blkio_group(blkg); +} + +#else /* CFQ_GROUP_IOSCHED */ +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ +} +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) {} +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return 0; +} + +#endif /* CFQ_GROUP_IOSCHED */ +#endif -- cgit v1.1 From 9e495db1a1f931e82c9edccd677dd171be5b85d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Jun 2010 09:10:55 +0200 Subject: cfq: fix recursive call in cfq_blkiocg_update_completion_stats() e98ef89b has a typo, causing cfq_blkiocg_update_completion_stats() to call itself instead of blkiocg_update_completion_stats(). Reported-by: Ingo Molnar Signed-off-by: Jens Axboe --- block/cfq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/cfq.h b/block/cfq.h index 8783318..93448e5 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -63,7 +63,7 @@ static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) { - cfq_blkiocg_update_completion_stats(blkg, start_time, io_start_time, + blkiocg_update_completion_stats(blkg, start_time, io_start_time, direction, sync); } -- cgit v1.1 From 1b99973f1c82707e46e8cb9416865a1e955e8f8c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 24 Jun 2010 07:43:57 +0800 Subject: block: Don't count_vm_events for discard bio in submit_bio. In submit_bio, we count vm events by checking READ/WRITE. But actually DISCARD_NOBARRIER also has the WRITE flag set. It looks as if in blkdev_issue_discard, we also add a page as the payload, so the bio_has_data check isn't enough. So add another check for discard bios.
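Restated as a hypothetical predicate (just to make the fixed condition explicit; the change below keeps it open-coded in submit_bio):

	static inline bool bio_counts_as_pageio(struct bio *bio, int rw)
	{
		/*
		 * A discard bio carries a payload page, so bio_has_data()
		 * returns true for it even though no file data moves;
		 * such a bio must not be counted as PGPGIN/PGPGOUT.
		 */
		return bio_has_data(bio) && !(rw & (1 << BIO_RW_DISCARD));
	}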
Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 3deca77..f0640d7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1583,7 +1583,7 @@ void submit_bio(int rw, struct bio *bio) * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ - if (bio_has_data(bio)) { + if (bio_has_data(bio) && !(rw & (1 << BIO_RW_DISCARD))) { if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { -- cgit v1.1